Diffstat (limited to 'src/plugins')
226 files changed, 85512 insertions, 0 deletions
diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am new file mode 100644 index 00000000..205bfe6d --- /dev/null +++ b/src/plugins/Makefile.am @@ -0,0 +1,101 @@ + +# Copyright (c) <current-year> <your-organization> +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +AUTOMAKE_OPTIONS = foreign subdir-objects + +AM_CFLAGS = -Wall -I${top_srcdir} -I${top_builddir} +AM_LDFLAGS = -module -shared -avoid-version +AM_LIBTOOLFLAGS = --quiet +SUFFIXES = .api.h .api .api.json +API_FILES = +BUILT_SOURCES = +vppplugins_LTLIBRARIES = +vppapitestplugins_LTLIBRARIES = +noinst_HEADERS = +nobase_apiinclude_HEADERS = +nobase_include_HEADERS = + +vppapitestpluginsdir = ${libdir}/vpp_api_test_plugins +vpppluginsdir = ${libdir}/vpp_plugins + +if ENABLE_ACL_PLUGIN +include acl.am +endif + +if ENABLE_DPDK_PLUGIN +include dpdk.am +endif + +if ENABLE_FLOWPROBE_PLUGIN +include flowprobe.am +endif + + +if ENABLE_GTPU_PLUGIN +include gtpu.am +endif + +if ENABLE_ILA_PLUGIN +include ila.am +endif + +if ENABLE_IOAM_PLUGIN +include ioam.am +endif + +if ENABLE_IXGE_PLUGIN +include ixge.am +endif + +if ENABLE_LB_PLUGIN +include lb.am +endif + +if ENABLE_MEMIF_PLUGIN +include memif.am +endif + +if ENABLE_PPPOE_PLUGIN +include pppoe.am +endif + +if ENABLE_SIXRD_PLUGIN +include sixrd.am +endif + +if ENABLE_NAT_PLUGIN +include nat.am +endif + +include ../suffix-rules.mk + +# Remove *.la files +install-data-hook: + @-(cd $(vpppluginsdir) && $(RM) $(vppplugins_LTLIBRARIES)) + @-(cd $(vppapitestpluginsdir) && $(RM) $(vppapitestplugins_LTLIBRARIES)) + +############################################################################### +# API +############################################################################### + +apidir = $(prefix)/share/vpp/api/plugins +apiincludedir = ${includedir}/vpp_plugins + +api_DATA = \ + $(patsubst %.api,%.api.json,$(API_FILES)) + +BUILT_SOURCES += \ + $(patsubst %.api,%.api.h,$(API_FILES)) + +CLEANFILES = $(BUILT_SOURCES) $(api_DATA) diff --git a/src/plugins/acl.am b/src/plugins/acl.am new file mode 100644 index 00000000..0a414481 --- /dev/null +++ b/src/plugins/acl.am @@ -0,0 +1,35 @@ +# Copyright (c) 2016 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +vppapitestplugins_LTLIBRARIES += acl_test_plugin.la +vppplugins_LTLIBRARIES += acl_plugin.la + +acl_plugin_la_SOURCES = \ + acl/acl.c \ + acl/hash_lookup.c \ + acl/fa_node.c \ + acl/l2sess.h \ + acl/manual_fns.h \ + acl/acl_plugin.api.h + +API_FILES += acl/acl.api + +nobase_apiinclude_HEADERS += \ + acl/acl_all_api_h.h \ + acl/acl_msg_enum.h \ + acl/manual_fns.h \ + acl/acl.api.h + +acl_test_plugin_la_SOURCES = acl/acl_test.c acl/acl_plugin.api.h acl/acl_all_api.h + +# vi:syntax=automake diff --git a/src/plugins/acl/acl.api b/src/plugins/acl/acl.api new file mode 100644 index 00000000..a0de24a2 --- /dev/null +++ b/src/plugins/acl/acl.api @@ -0,0 +1,477 @@ +/* Hey Emacs use -*- mode: C -*- */ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** \file + This file defines the vpp control-plane API messages + used to control the ACL plugin +*/ + + +/** \brief Get the plugin version + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request +*/ + +define acl_plugin_get_version +{ + u32 client_index; + u32 context; +}; + +/** \brief Reply to get the plugin version + @param context - returned sender context, to match reply w/ request + @param major - Incremented every time a known breaking behavior change is introduced + @param minor - Incremented with small changes, may be used to avoid buggy versions +*/ + +define acl_plugin_get_version_reply +{ + u32 context; + u32 major; + u32 minor; +}; + +/** \brief Control ping from client to api server request + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request +*/ +define acl_plugin_control_ping +{ + u32 client_index; + u32 context; +}; + +/** \brief Control ping from the client to the server response + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param retval - return code for the request + @param vpe_pid - the pid of the vpe, returned by the server +*/ +define acl_plugin_control_ping_reply +{ + u32 context; + i32 retval; + u32 client_index; + u32 vpe_pid; +}; + +/** \brief Access List Rule entry + @param is_permit - deny (0), permit (1), or permit+reflect(2) action on this rule. 
+    @param is_ipv6 - IP addresses in this rule are IPv6 (1) or IPv4 (0)
+    @param src_ip_addr - Source prefix value
+    @param src_ip_prefix_len - Source prefix length
+    @param dst_ip_addr - Destination prefix value
+    @param dst_ip_prefix_len - Destination prefix length
+    @param proto - L4 protocol (http://www.iana.org/assignments/protocol-numbers/protocol-numbers.xhtml)
+    @param srcport_or_icmptype_first - beginning of source port or ICMP4/6 type range
+    @param srcport_or_icmptype_last - end of source port or ICMP4/6 type range
+    @param dstport_or_icmpcode_first - beginning of destination port or ICMP4/6 code range
+    @param dstport_or_icmpcode_last - end of destination port or ICMP4/6 code range
+    @param tcp_flags_mask - if proto==6, mask to AND the TCP flags in the packet with
+    @param tcp_flags_value - if proto==6, value to match the masked TCP flags against
+*/
+
+typeonly manual_print define acl_rule
+{
+  u8 is_permit;
+  u8 is_ipv6;
+  u8 src_ip_addr[16];
+  u8 src_ip_prefix_len;
+  u8 dst_ip_addr[16];
+  u8 dst_ip_prefix_len;
+/*
+ * L4 protocol. IANA number. 1 = ICMP, 58 = ICMPv6, 6 = TCP, 17 = UDP.
+ * 0 => ignore L4 and ignore the ports/tcpflags when matching.
+ */
+  u8 proto;
+/*
+ * If the L4 protocol is TCP or UDP, the below
+ * hold ranges of ports, else if the L4 is ICMP/ICMPv6
+ * they hold ranges of ICMP(v6) types/codes.
+ *
+ * Ranges are inclusive, i.e. to match "any" TCP/UDP port,
+ * use first=0,last=65535. For ICMP(v6),
+ * use first=0,last=255.
+ */
+  u16 srcport_or_icmptype_first;
+  u16 srcport_or_icmptype_last;
+  u16 dstport_or_icmpcode_first;
+  u16 dstport_or_icmpcode_last;
+/*
+ * for proto = 6, this matches if the
+ * TCP flags in the packet, ANDed with tcp_flags_mask,
+ * are equal to tcp_flags_value.
+ */
+  u8 tcp_flags_mask;
+  u8 tcp_flags_value;
+};
+
+/** \brief MACIP Access List Rule entry
+    @param is_permit - deny (0), permit (1) action on this rule
+    @param is_ipv6 - IP addresses in this rule are IPv6 (1) or IPv4 (0)
+    @param src_mac - match masked source MAC address against this value
+    @param src_mac_mask - AND source MAC address with this value before matching
+    @param src_ip_addr - Source prefix value
+    @param src_ip_prefix_len - Source prefix length
+*/
+
+typeonly manual_print define macip_acl_rule
+{
+  u8 is_permit;
+  u8 is_ipv6;
+/*
+ * The source mac of the packet ANDed with src_mac_mask.
+ * The source ip[46] address in the packet is matched
+ * against src_ip_addr, with src_ip_prefix_len as the prefix length.
+ *
+ * For better performance, minimize the number of
+ * (src_mac_mask, src_ip_prefix_len) combinations
+ * in a MACIP ACL.
+ */
+  u8 src_mac[6];
+  u8 src_mac_mask[6];
+  u8 src_ip_addr[16];
+  u8 src_ip_prefix_len;
+};
+
+/** \brief Replace an existing ACL in-place or create a new ACL
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param acl_index - an existing ACL entry (0..0xfffffffe) to replace, or 0xffffffff to make a new ACL
+    @param tag - a string value stored along with the ACL, for descriptive purposes
+    @param count - number of ACL rules
+    @param r - Rules for this access-list
+*/
+
+manual_print manual_endian define acl_add_replace
+{
+  u32 client_index;
+  u32 context;
+  u32 acl_index; /* ~0 to add, existing ACL# to replace */
+  u8 tag[64]; /* What gets in here gets out in the corresponding tag field when dumping the ACLs. */
+  u32 count;
+  vl_api_acl_rule_t r[count];
+};
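[Editorial note: the rule layout above is easiest to grasp with a concrete value set. The fragment below is a client-side C sketch, not part of this commit; it assumes the generated vl_api_acl_rule_t type plus <string.h> and <arpa/inet.h>. It fills one rule that permits TCP from 10.0.0.0/8 to any destination address, destination port 80 only. Ports travel in network byte order, matching the ntohs() conversions done in acl_add_list() further down in this diff.]

    /* Illustrative only: permit+reflect TCP 10.0.0.0/8 -> any, dst port 80 */
    vl_api_acl_rule_t r;
    memset (&r, 0, sizeof (r));
    r.is_permit = 2;                       /* 2 = permit+reflect */
    r.is_ipv6 = 0;
    r.src_ip_addr[0] = 10;                 /* 10.0.0.0; remaining bytes stay zero */
    r.src_ip_prefix_len = 8;
    r.dst_ip_prefix_len = 0;               /* prefix length 0 = any destination */
    r.proto = 6;                           /* TCP */
    r.srcport_or_icmptype_first = htons (0);      /* any source port: 0..65535 */
    r.srcport_or_icmptype_last = htons (65535);
    r.dstport_or_icmpcode_first = htons (80);     /* inclusive range, so 80..80 */
    r.dstport_or_icmpcode_last = htons (80);

[Since the ranges are inclusive, a single port is expressed as first == last, and "any port" as 0..65535, exactly as the comment in the type definition says.]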
+
+/** \brief Reply to add/replace ACL
+    @param context - returned sender context, to match reply w/ request
+    @param acl_index - index of the updated or newly created ACL
+    @param retval 0 - no error
+*/
+
+define acl_add_replace_reply
+{
+  u32 context;
+  u32 acl_index;
+  i32 retval;
+};
+
+/** \brief Delete an ACL
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param acl_index - ACL index to delete
+*/
+
+autoreply manual_print define acl_del
+{
+  u32 client_index;
+  u32 context;
+  u32 acl_index;
+};
+
+/* acl_interface_add_del(_reply) to be deprecated in favor of acl_interface_set_acl_list */
+/** \brief Use acl_interface_set_acl_list instead
+    Append/remove an ACL index to/from the list of ACLs checked for an interface
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param is_add - add or delete the ACL index from the list
+    @param is_input - check the ACL on input (1) or output (0)
+    @param sw_if_index - the interface to alter the list of ACLs on
+    @param acl_index - index of ACL for the operation
+*/
+
+autoreply manual_print define acl_interface_add_del
+{
+  u32 client_index;
+  u32 context;
+  u8 is_add;
+/*
+ * is_input = 0 => ACL applied on interface egress
+ * is_input = 1 => ACL applied on interface ingress
+ */
+  u8 is_input;
+  u32 sw_if_index;
+  u32 acl_index;
+};
+
+/** \brief Set the vector of input/output ACLs checked for an interface
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param sw_if_index - the interface to alter the list of ACLs on
+    @param count - total number of ACL indices in the vector
+    @param n_input - the first n_input elements correspond to input ACLs, the rest to output
+    @param acls - vector of ACL indices
+*/
+
+autoreply manual_print define acl_interface_set_acl_list
+{
+  u32 client_index;
+  u32 context;
+  u32 sw_if_index;
+  u8 count;
+  u8 n_input; /* First n_input ACLs are set as a list of input ACLs, the rest are applied as output */
+  u32 acls[count];
+};
+
+/** \brief Reply to set the ACL list on an interface
+    @param context - returned sender context, to match reply w/ request
+    @param retval 0 - no error
+*/
+
+/** \brief Dump the specific ACL contents or all of the ACLs' contents
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param acl_index - ACL index to dump, ~0 to dump all ACLs
+*/
+
+define acl_dump
+{
+  u32 client_index;
+  u32 context;
+  u32 acl_index; /* ~0 for all ACLs */
+};
+
+/** \brief Details about a single ACL's contents
+    @param context - returned sender context, to match reply w/ request
+    @param acl_index - ACL index whose contents are being sent in this message
+    @param tag - Descriptive tag value which was supplied at ACL creation
+    @param count - Number of rules in this ACL
+    @param r - Array of rules within this ACL
+*/
+
+manual_endian manual_print define acl_details
+{
+  u32 context;
+  u32 acl_index;
+  u8 tag[64]; /* Same blob that was supplied to us when creating the ACL, one hopes. */
+  u32 count;
+  vl_api_acl_rule_t r[count];
+};
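[Editorial note: the count/n_input convention in acl_interface_set_acl_list above deserves a concrete illustration. The following client-side C sketch is hypothetical: mp is assumed to point at an already-allocated vl_api_acl_interface_set_acl_list_t with room for three trailing indices. It applies ACLs 5 and 7 on input and ACL 9 on output of sw_if_index 1.]

    /* Hypothetical sketch: ACLs 5 and 7 inbound, ACL 9 outbound */
    mp->sw_if_index = htonl (1);
    mp->count = 3;              /* u8: total number of indices in acls[] */
    mp->n_input = 2;            /* first 2 are input, the remaining 1 is output */
    mp->acls[0] = htonl (5);    /* input */
    mp->acls[1] = htonl (7);    /* input */
    mp->acls[2] = htonl (9);    /* output */

[Note that count and n_input are single bytes used as-is, while each ACL index is a u32 that the handler converts with ntohl(), as visible in vl_api_acl_interface_set_acl_list_t_handler later in this diff.]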
+
+/** \brief Dump the list(s) of ACLs applied to specific or all interfaces
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param sw_if_index - interface to dump the ACL list for
+*/
+
+define acl_interface_list_dump
+{
+  u32 client_index;
+  u32 context;
+  u32 sw_if_index; /* ~0 for all interfaces */
+};
+
+/** \brief Details about the ACLs applied to a single interface
+    @param context - returned sender context, to match reply w/ request
+    @param sw_if_index - interface for which the list of ACLs is applied
+    @param count - total length of acl indices vector
+    @param n_input - the first n_input indices are input ACLs, the rest are output
+    @param acls - the vector of ACL indices
+*/
+
+define acl_interface_list_details
+{
+  u32 context;
+  u32 sw_if_index;
+  u8 count;
+  u8 n_input;
+  u32 acls[count];
+};
+
+/** \brief Add a MACIP ACL
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param tag - descriptive value for this MACIP ACL
+    @param count - number of rules in this MACIP ACL
+    @param r - vector of MACIP ACL rules
+*/
+
+manual_endian manual_print define macip_acl_add
+{
+  u32 client_index;
+  u32 context;
+  u8 tag[64];
+  u32 count;
+  vl_api_macip_acl_rule_t r[count];
+};
+
+/** \brief Reply to add MACIP ACL
+    @param context - returned sender context, to match reply w/ request
+    @param acl_index - index of the newly created MACIP ACL
+    @param retval 0 - no error
+*/
+
+define macip_acl_add_reply
+{
+  u32 context;
+  u32 acl_index;
+  i32 retval;
+};
+
+/** \brief Add/Replace a MACIP ACL
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param acl_index - an existing MACIP ACL entry (0..0xfffffffe) to replace, or 0xffffffff to make a new MACIP ACL
+    @param tag - descriptive value for this MACIP ACL
+    @param count - number of rules in this MACIP ACL
+    @param r - vector of MACIP ACL rules
+*/
+
+manual_endian manual_print define macip_acl_add_replace
+{
+  u32 client_index;
+  u32 context;
+  u32 acl_index; /* ~0 to add, existing MACIP ACL# to replace */
+  u8 tag[64];
+  u32 count;
+  vl_api_macip_acl_rule_t r[count];
+};
+
+/** \brief Reply to add/replace MACIP ACL
+    @param context - returned sender context, to match reply w/ request
+    @param acl_index - index of the updated or newly created MACIP ACL
+    @param retval 0 - no error
+*/
+
+define macip_acl_add_replace_reply
+{
+  u32 context;
+  u32 acl_index;
+  i32 retval;
+};
+
+/** \brief Delete a MACIP ACL
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param acl_index - MACIP ACL index to delete
+*/
+
+autoreply manual_print define macip_acl_del
+{
+  u32 client_index;
+  u32 context;
+  u32 acl_index;
+};
+
+/** \brief Add or delete a MACIP ACL to/from an interface
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param is_add - add (1) or delete (0) MACIP ACL from being used on an interface
+    @param sw_if_index - interface to apply the action to
+    @param acl_index - MACIP ACL index
+*/
+
+autoreply manual_print define macip_acl_interface_add_del
+{
+  u32 client_index;
+  u32 context;
+  u8 is_add;
+  /* MACIP ACLs are always input */
+  u32 sw_if_index;
+  u32 acl_index;
+};
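[Editorial note: to make the MACIP matching model above concrete, here is a client-side C sketch (illustrative values only; the generated vl_api_macip_acl_rule_t type and <string.h> are assumed) of a classic anti-spoofing rule: permit traffic only when source MAC 02:fe:00:00:00:01 is paired with source IP 192.168.1.10.]

    /* Illustrative only: bind one MAC to one IPv4 host address */
    vl_api_macip_acl_rule_t r;
    u8 mac[6] = { 0x02, 0xfe, 0x00, 0x00, 0x00, 0x01 };
    u8 ip[4] = { 192, 168, 1, 10 };
    memset (&r, 0, sizeof (r));
    r.is_permit = 1;
    r.is_ipv6 = 0;
    memcpy (r.src_mac, mac, 6);
    memset (r.src_mac_mask, 0xff, 6);   /* all-ones mask = exact MAC match */
    memcpy (r.src_ip_addr, ip, 4);      /* IPv4 uses the first 4 of 16 bytes */
    r.src_ip_prefix_len = 32;           /* /32 = exact host match */

[Per the performance note in the type definition, a real MACIP ACL would reuse this one (mask, prefix-length) combination for every host entry rather than varying it per rule.]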
+
+/** \brief Dump one or all defined MACIP ACLs
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param acl_index - MACIP ACL index or ~0 to dump all MACIP ACLs
+*/
+
+define macip_acl_dump
+{
+  u32 client_index;
+  u32 context;
+  u32 acl_index; /* ~0 for all ACLs */
+};
+
+/** \brief Details about one MACIP ACL
+    @param context - returned sender context, to match reply w/ request
+    @param acl_index - index of this MACIP ACL
+    @param tag - descriptive tag which was supplied at creation
+    @param count - length of the vector of MACIP ACL rules
+    @param r - rules comprising this MACIP ACL
+*/
+
+manual_endian manual_print define macip_acl_details
+{
+  u32 context;
+  u32 acl_index;
+  u8 tag[64];
+  u32 count;
+  vl_api_macip_acl_rule_t r[count];
+};
+
+/** \brief Get the vector of MACIP ACL IDs applied to the interfaces
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+*/
+
+define macip_acl_interface_get
+{
+  u32 client_index;
+  u32 context;
+};
+
+/** \brief Reply with the vector of MACIP ACLs by sw_if_index
+    @param context - returned sender context, to match reply w/ request
+    @param count - total number of elements in the vector
+    @param acls - the vector of active MACIP ACL indices per sw_if_index
+*/
+
+define macip_acl_interface_get_reply
+{
+  u32 context;
+  u32 count;
+  u32 acls[count];
+};
+
+/** \brief Dump the list(s) of MACIP ACLs applied to specific or all interfaces
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param sw_if_index - interface to dump the MACIP ACL list for
+*/
+
+define macip_acl_interface_list_dump
+{
+  u32 client_index;
+  u32 context;
+  u32 sw_if_index; /* ~0 for all interfaces */
+};
+
+/** \brief Details about the MACIP ACLs applied to a single interface
+    @param context - returned sender context, to match reply w/ request
+    @param sw_if_index - interface for which the list of MACIP ACLs is applied
+    @param count - total length of acl indices vector
+    @param acls - the vector of MACIP ACL indices
+*/
+
+define macip_acl_interface_list_details
+{
+  u32 context;
+  u32 sw_if_index;
+  u8 count;
+  u32 acls[count];
+};
diff --git a/src/plugins/acl/acl.c b/src/plugins/acl/acl.c
new file mode 100644
index 00000000..efd506de
--- /dev/null
+++ b/src/plugins/acl/acl.c
@@ -0,0 +1,2709 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include <stddef.h> + +#include <vnet/vnet.h> +#include <vnet/plugin/plugin.h> +#include <acl/acl.h> + +#include <vnet/l2/l2_classify.h> +#include <vnet/classify/input_acl.h> +#include <vpp/app/version.h> + +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vlibsocket/api.h> + +/* define message IDs */ +#include <acl/acl_msg_enum.h> + +/* define message structures */ +#define vl_typedefs +#include <acl/acl_all_api_h.h> +#undef vl_typedefs + +/* define generated endian-swappers */ +#define vl_endianfun +#include <acl/acl_all_api_h.h> +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__) +#define vl_printfun +#include <acl/acl_all_api_h.h> +#undef vl_printfun + +/* Get the API version number */ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <acl/acl_all_api_h.h> +#undef vl_api_version + +#include "fa_node.h" +#include "hash_lookup.h" + +acl_main_t acl_main; + +#define REPLY_MSG_ID_BASE am->msg_id_base +#include <vlibapi/api_helper_macros.h> + +/* List of message types that this plugin understands */ + +#define foreach_acl_plugin_api_msg \ +_(ACL_PLUGIN_GET_VERSION, acl_plugin_get_version) \ +_(ACL_PLUGIN_CONTROL_PING, acl_plugin_control_ping) \ +_(ACL_ADD_REPLACE, acl_add_replace) \ +_(ACL_DEL, acl_del) \ +_(ACL_INTERFACE_ADD_DEL, acl_interface_add_del) \ +_(ACL_INTERFACE_SET_ACL_LIST, acl_interface_set_acl_list) \ +_(ACL_DUMP, acl_dump) \ +_(ACL_INTERFACE_LIST_DUMP, acl_interface_list_dump) \ +_(MACIP_ACL_ADD, macip_acl_add) \ +_(MACIP_ACL_ADD_REPLACE, macip_acl_add_replace) \ +_(MACIP_ACL_DEL, macip_acl_del) \ +_(MACIP_ACL_INTERFACE_ADD_DEL, macip_acl_interface_add_del) \ +_(MACIP_ACL_DUMP, macip_acl_dump) \ +_(MACIP_ACL_INTERFACE_GET, macip_acl_interface_get) \ +_(MACIP_ACL_INTERFACE_LIST_DUMP, macip_acl_interface_list_dump) + + +/* *INDENT-OFF* */ +VLIB_PLUGIN_REGISTER () = { + .version = VPP_BUILD_VER, + .description = "Access Control Lists", +}; +/* *INDENT-ON* */ + + +static void * +acl_set_heap(acl_main_t *am) +{ + if (0 == am->acl_mheap) { + am->acl_mheap = mheap_alloc (0 /* use VM */ , am->acl_mheap_size); + mheap_t *h = mheap_header (am->acl_mheap); + h->flags |= MHEAP_FLAG_THREAD_SAFE; + } + void *oldheap = clib_mem_set_heap(am->acl_mheap); + return oldheap; +} + +void +acl_plugin_acl_set_validate_heap(acl_main_t *am, int on) +{ + clib_mem_set_heap(acl_set_heap(am)); + mheap_t *h = mheap_header (am->acl_mheap); + if (on) { + h->flags |= MHEAP_FLAG_VALIDATE; + h->flags &= ~MHEAP_FLAG_SMALL_OBJECT_CACHE; + mheap_validate(h); + } else { + h->flags &= ~MHEAP_FLAG_VALIDATE; + h->flags |= MHEAP_FLAG_SMALL_OBJECT_CACHE; + } +} + +void +acl_plugin_acl_set_trace_heap(acl_main_t *am, int on) +{ + clib_mem_set_heap(acl_set_heap(am)); + mheap_t *h = mheap_header (am->acl_mheap); + if (on) { + h->flags |= MHEAP_FLAG_TRACE; + } else { + h->flags &= ~MHEAP_FLAG_TRACE; + } +} + +static void +vl_api_acl_plugin_get_version_t_handler (vl_api_acl_plugin_get_version_t * mp) +{ + acl_main_t *am = &acl_main; + vl_api_acl_plugin_get_version_reply_t *rmp; + int msg_size = sizeof (*rmp); + unix_shared_memory_queue_t *q; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + { + return; + } + + rmp = vl_msg_api_alloc (msg_size); + memset (rmp, 0, msg_size); + rmp->_vl_msg_id = + ntohs (VL_API_ACL_PLUGIN_GET_VERSION_REPLY + am->msg_id_base); + rmp->context = mp->context; + rmp->major = htonl (ACL_PLUGIN_VERSION_MAJOR); + rmp->minor = htonl 
(ACL_PLUGIN_VERSION_MINOR); + + vl_msg_api_send_shmem (q, (u8 *) & rmp); +} + +static void +vl_api_acl_plugin_control_ping_t_handler (vl_api_acl_plugin_control_ping_t * mp) +{ + vl_api_acl_plugin_control_ping_reply_t *rmp; + acl_main_t *am = &acl_main; + int rv = 0; + + /* *INDENT-OFF* */ + REPLY_MACRO2 (VL_API_ACL_PLUGIN_CONTROL_PING_REPLY, + ({ + rmp->vpe_pid = ntohl (getpid ()); + })); + /* *INDENT-ON* */ +} + +static int +acl_add_list (u32 count, vl_api_acl_rule_t rules[], + u32 * acl_list_index, u8 * tag) +{ + acl_main_t *am = &acl_main; + acl_list_t *a; + acl_rule_t *r; + acl_rule_t *acl_new_rules = 0; + int i; + + if (*acl_list_index != ~0) + { + /* They supplied some number, let's see if this ACL exists */ + if (pool_is_free_index (am->acls, *acl_list_index)) + { + /* tried to replace a non-existent ACL, no point doing anything */ + clib_warning("acl-plugin-error: Trying to replace nonexistent ACL %d (tag %s)", *acl_list_index, tag); + return -1; + } + } + if (0 == count) { + clib_warning("acl-plugin-warning: supplied no rules for ACL %d (tag %s)", *acl_list_index, tag); + } + + void *oldheap = acl_set_heap(am); + + /* Create and populate the rules */ + if (count > 0) + vec_validate(acl_new_rules, count-1); + + for (i = 0; i < count; i++) + { + r = vec_elt_at_index(acl_new_rules, i); + memset(r, 0, sizeof(*r)); + r->is_permit = rules[i].is_permit; + r->is_ipv6 = rules[i].is_ipv6; + if (r->is_ipv6) + { + memcpy (&r->src, rules[i].src_ip_addr, sizeof (r->src)); + memcpy (&r->dst, rules[i].dst_ip_addr, sizeof (r->dst)); + } + else + { + memcpy (&r->src.ip4, rules[i].src_ip_addr, sizeof (r->src.ip4)); + memcpy (&r->dst.ip4, rules[i].dst_ip_addr, sizeof (r->dst.ip4)); + } + r->src_prefixlen = rules[i].src_ip_prefix_len; + r->dst_prefixlen = rules[i].dst_ip_prefix_len; + r->proto = rules[i].proto; + r->src_port_or_type_first = ntohs ( rules[i].srcport_or_icmptype_first ); + r->src_port_or_type_last = ntohs ( rules[i].srcport_or_icmptype_last ); + r->dst_port_or_code_first = ntohs ( rules[i].dstport_or_icmpcode_first ); + r->dst_port_or_code_last = ntohs ( rules[i].dstport_or_icmpcode_last ); + r->tcp_flags_value = rules[i].tcp_flags_value; + r->tcp_flags_mask = rules[i].tcp_flags_mask; + } + + if (~0 == *acl_list_index) + { + /* Get ACL index */ + pool_get_aligned (am->acls, a, CLIB_CACHE_LINE_BYTES); + memset (a, 0, sizeof (*a)); + /* Will return the newly allocated ACL index */ + *acl_list_index = a - am->acls; + } + else + { + a = am->acls + *acl_list_index; + hash_acl_delete(am, *acl_list_index); + /* Get rid of the old rules */ + if (a->rules) + vec_free (a->rules); + } + a->rules = acl_new_rules; + a->count = count; + memcpy (a->tag, tag, sizeof (a->tag)); + hash_acl_add(am, *acl_list_index); + clib_mem_set_heap (oldheap); + return 0; +} + +static int +acl_del_list (u32 acl_list_index) +{ + acl_main_t *am = &acl_main; + acl_list_t *a; + int i, ii; + if (pool_is_free_index (am->acls, acl_list_index)) + { + return -1; + } + + if (acl_list_index < vec_len(am->input_sw_if_index_vec_by_acl)) { + if (vec_len(vec_elt(am->input_sw_if_index_vec_by_acl, acl_list_index)) > 0) { + /* ACL is applied somewhere inbound. Refuse to delete */ + return -1; + } + } + if (acl_list_index < vec_len(am->output_sw_if_index_vec_by_acl)) { + if (vec_len(vec_elt(am->output_sw_if_index_vec_by_acl, acl_list_index)) > 0) { + /* ACL is applied somewhere outbound. 
Refuse to delete */ + return -1; + } + } + + void *oldheap = acl_set_heap(am); + /* delete any references to the ACL */ + for (i = 0; i < vec_len (am->output_acl_vec_by_sw_if_index); i++) + { + for (ii = 0; ii < vec_len (am->output_acl_vec_by_sw_if_index[i]); + /* see body */ ) + { + if (acl_list_index == am->output_acl_vec_by_sw_if_index[i][ii]) + { + vec_del1 (am->output_acl_vec_by_sw_if_index[i], ii); + } + else + { + ii++; + } + } + } + for (i = 0; i < vec_len (am->input_acl_vec_by_sw_if_index); i++) + { + for (ii = 0; ii < vec_len (am->input_acl_vec_by_sw_if_index[i]); + /* see body */ ) + { + if (acl_list_index == am->input_acl_vec_by_sw_if_index[i][ii]) + { + vec_del1 (am->input_acl_vec_by_sw_if_index[i], ii); + } + else + { + ii++; + } + } + } + /* delete the hash table data */ + + hash_acl_delete(am, acl_list_index); + /* now we can delete the ACL itself */ + a = pool_elt_at_index (am->acls, acl_list_index); + if (a->rules) + vec_free (a->rules); + + pool_put (am->acls, a); + clib_mem_set_heap (oldheap); + return 0; +} + +/* Some aids in ASCII graphing the content */ +#define XX "\377" +#define __ "\000" +#define _(x) +#define v + +u8 ip4_5tuple_mask[] = +_(" dmac smac etype ") +_(ether) __ __ __ __ __ __ v __ __ __ __ __ __ v __ __ v + _(" v ihl totlen ") + _(0x0000) + __ __ __ __ + _(" ident fl+fo ") + _(0x0004) + __ __ __ __ + _(" ttl pr checksum ") + _(0x0008) + __ XX __ __ + _(" src address ") + _(0x000C) + XX XX XX XX + _(" dst address ") + _(0x0010) + XX XX XX XX + _("L4 T/U sport dport ") + _(tcpudp) + XX XX XX XX + _(padpad) + __ __ __ __ + _(padpad) + __ __ __ __ + _(padeth) + __ __; + + u8 ip6_5tuple_mask[] = + _(" dmac smac etype ") + _(ether) __ __ __ __ __ __ v __ __ __ __ __ __ v __ __ v + _(" v tc + flow ") + _(0x0000) __ __ __ __ + _(" plen nh hl ") + _(0x0004) __ __ XX __ + _(" src address ") + _(0x0008) XX XX XX XX + _(0x000C) XX XX XX XX + _(0x0010) XX XX XX XX + _(0x0014) XX XX XX XX + _(" dst address ") + _(0x0018) XX XX XX XX + _(0x001C) XX XX XX XX + _(0x0020) XX XX XX XX + _(0x0024) XX XX XX XX + _("L4T/U sport dport ") + _(tcpudp) XX XX XX XX _(padpad) __ __ __ __ _(padeth) __ __; + +#undef XX +#undef __ +#undef _ +#undef v + + static int count_skip (u8 * p, u32 size) +{ + u64 *p64 = (u64 *) p; + /* Be tolerant to null pointer */ + if (0 == p) + return 0; + + while ((0ULL == *p64) && ((u8 *) p64 - p) < size) + { + p64++; + } + return (p64 - (u64 *) p) / 2; +} + +static int +acl_classify_add_del_table_tiny (vnet_classify_main_t * cm, u8 * mask, + u32 mask_len, u32 next_table_index, + u32 miss_next_index, u32 * table_index, + int is_add) +{ + u32 nbuckets = 1; + u32 memory_size = 2 << 13; + u32 skip = count_skip (mask, mask_len); + u32 match = (mask_len / 16) - skip; + u8 *skip_mask_ptr = mask + 16 * skip; + u32 current_data_flag = 0; + int current_data_offset = 0; + + if (0 == match) + match = 1; + void *oldheap = clib_mem_set_heap (cm->vlib_main->heap_base); + int ret = vnet_classify_add_del_table (cm, skip_mask_ptr, nbuckets, + memory_size, skip, match, + next_table_index, miss_next_index, + table_index, current_data_flag, + current_data_offset, is_add, + 1 /* delete_chain */); + clib_mem_set_heap (oldheap); + return ret; +} + +static int +acl_classify_add_del_table_small (vnet_classify_main_t * cm, u8 * mask, + u32 mask_len, u32 next_table_index, + u32 miss_next_index, u32 * table_index, + int is_add) +{ + u32 nbuckets = 32; + u32 memory_size = 2 << 20; + u32 skip = count_skip (mask, mask_len); + u32 match = (mask_len / 16) - skip; + u8 *skip_mask_ptr = 
mask + 16 * skip; + u32 current_data_flag = 0; + int current_data_offset = 0; + + if (0 == match) + match = 1; + + void *oldheap = clib_mem_set_heap (cm->vlib_main->heap_base); + int ret = vnet_classify_add_del_table (cm, skip_mask_ptr, nbuckets, + memory_size, skip, match, + next_table_index, miss_next_index, + table_index, current_data_flag, + current_data_offset, is_add, + 1 /* delete_chain */); + clib_mem_set_heap (oldheap); + return ret; +} + + +static int +acl_unhook_l2_input_classify (acl_main_t * am, u32 sw_if_index) +{ + vnet_classify_main_t *cm = &vnet_classify_main; + u32 ip4_table_index = ~0; + u32 ip6_table_index = ~0; + void *oldheap = acl_set_heap(am); + + vec_validate_init_empty (am->acl_ip4_input_classify_table_by_sw_if_index, + sw_if_index, ~0); + vec_validate_init_empty (am->acl_ip6_input_classify_table_by_sw_if_index, + sw_if_index, ~0); + + /* switch to global heap while calling vnet_* functions */ + clib_mem_set_heap (cm->vlib_main->heap_base); + vnet_l2_input_classify_enable_disable (sw_if_index, 0); + + if (am->acl_ip4_input_classify_table_by_sw_if_index[sw_if_index] != ~0) + { + ip4_table_index = + am->acl_ip4_input_classify_table_by_sw_if_index[sw_if_index]; + am->acl_ip4_input_classify_table_by_sw_if_index[sw_if_index] = ~0; + acl_classify_add_del_table_tiny (cm, ip4_5tuple_mask, + sizeof (ip4_5tuple_mask) - 1, ~0, + am->l2_input_classify_next_acl_ip4, + &ip4_table_index, 0); + } + if (am->acl_ip6_input_classify_table_by_sw_if_index[sw_if_index] != ~0) + { + ip6_table_index = + am->acl_ip6_input_classify_table_by_sw_if_index[sw_if_index]; + am->acl_ip6_input_classify_table_by_sw_if_index[sw_if_index] = ~0; + acl_classify_add_del_table_tiny (cm, ip6_5tuple_mask, + sizeof (ip6_5tuple_mask) - 1, ~0, + am->l2_input_classify_next_acl_ip6, + &ip6_table_index, 0); + } + clib_mem_set_heap (oldheap); + return 0; +} + +static int +acl_unhook_l2_output_classify (acl_main_t * am, u32 sw_if_index) +{ + vnet_classify_main_t *cm = &vnet_classify_main; + u32 ip4_table_index = ~0; + u32 ip6_table_index = ~0; + void *oldheap = acl_set_heap(am); + + vec_validate_init_empty (am->acl_ip4_output_classify_table_by_sw_if_index, + sw_if_index, ~0); + vec_validate_init_empty (am->acl_ip6_output_classify_table_by_sw_if_index, + sw_if_index, ~0); + + /* switch to global heap while calling vnet_* functions */ + clib_mem_set_heap (cm->vlib_main->heap_base); + + vnet_l2_output_classify_enable_disable (sw_if_index, 0); + + if (am->acl_ip4_output_classify_table_by_sw_if_index[sw_if_index] != ~0) + { + ip4_table_index = + am->acl_ip4_output_classify_table_by_sw_if_index[sw_if_index]; + am->acl_ip4_output_classify_table_by_sw_if_index[sw_if_index] = ~0; + acl_classify_add_del_table_tiny (cm, ip4_5tuple_mask, + sizeof (ip4_5tuple_mask) - 1, ~0, + am->l2_output_classify_next_acl_ip4, + &ip4_table_index, 0); + } + if (am->acl_ip6_output_classify_table_by_sw_if_index[sw_if_index] != ~0) + { + ip6_table_index = + am->acl_ip6_output_classify_table_by_sw_if_index[sw_if_index]; + am->acl_ip6_output_classify_table_by_sw_if_index[sw_if_index] = ~0; + acl_classify_add_del_table_tiny (cm, ip6_5tuple_mask, + sizeof (ip6_5tuple_mask) - 1, ~0, + am->l2_output_classify_next_acl_ip6, + &ip6_table_index, 0); + } + clib_mem_set_heap (oldheap); + return 0; +} + +static int +acl_hook_l2_input_classify (acl_main_t * am, u32 sw_if_index) +{ + vnet_classify_main_t *cm = &vnet_classify_main; + u32 ip4_table_index = ~0; + u32 ip6_table_index = ~0; + int rv; + + void *prevheap = clib_mem_set_heap (cm->vlib_main->heap_base); 
+ + /* in case there were previous tables attached */ + acl_unhook_l2_input_classify (am, sw_if_index); + rv = + acl_classify_add_del_table_tiny (cm, ip4_5tuple_mask, + sizeof (ip4_5tuple_mask) - 1, ~0, + am->l2_input_classify_next_acl_ip4, + &ip4_table_index, 1); + if (rv) + goto done; + rv = + acl_classify_add_del_table_tiny (cm, ip6_5tuple_mask, + sizeof (ip6_5tuple_mask) - 1, ~0, + am->l2_input_classify_next_acl_ip6, + &ip6_table_index, 1); + if (rv) + { + acl_classify_add_del_table_tiny (cm, ip4_5tuple_mask, + sizeof (ip4_5tuple_mask) - 1, ~0, + am->l2_input_classify_next_acl_ip4, + &ip4_table_index, 0); + goto done; + } + rv = + vnet_l2_input_classify_set_tables (sw_if_index, ip4_table_index, + ip6_table_index, ~0); + if (rv) + { + acl_classify_add_del_table_tiny (cm, ip6_5tuple_mask, + sizeof (ip6_5tuple_mask) - 1, ~0, + am->l2_input_classify_next_acl_ip6, + &ip6_table_index, 0); + acl_classify_add_del_table_tiny (cm, ip4_5tuple_mask, + sizeof (ip4_5tuple_mask) - 1, ~0, + am->l2_input_classify_next_acl_ip4, + &ip4_table_index, 0); + goto done; + } + + am->acl_ip4_input_classify_table_by_sw_if_index[sw_if_index] = + ip4_table_index; + am->acl_ip6_input_classify_table_by_sw_if_index[sw_if_index] = + ip6_table_index; + + vnet_l2_input_classify_enable_disable (sw_if_index, 1); +done: + clib_mem_set_heap (prevheap); + return rv; +} + +static int +acl_hook_l2_output_classify (acl_main_t * am, u32 sw_if_index) +{ + vnet_classify_main_t *cm = &vnet_classify_main; + u32 ip4_table_index = ~0; + u32 ip6_table_index = ~0; + int rv; + + void *prevheap = clib_mem_set_heap (cm->vlib_main->heap_base); + + /* in case there were previous tables attached */ + acl_unhook_l2_output_classify (am, sw_if_index); + rv = + acl_classify_add_del_table_tiny (cm, ip4_5tuple_mask, + sizeof (ip4_5tuple_mask) - 1, ~0, + am->l2_output_classify_next_acl_ip4, + &ip4_table_index, 1); + if (rv) + goto done; + rv = + acl_classify_add_del_table_tiny (cm, ip6_5tuple_mask, + sizeof (ip6_5tuple_mask) - 1, ~0, + am->l2_output_classify_next_acl_ip6, + &ip6_table_index, 1); + if (rv) + { + acl_classify_add_del_table_tiny (cm, ip4_5tuple_mask, + sizeof (ip4_5tuple_mask) - 1, ~0, + am->l2_output_classify_next_acl_ip4, + &ip4_table_index, 0); + goto done; + } + rv = + vnet_l2_output_classify_set_tables (sw_if_index, ip4_table_index, + ip6_table_index, ~0); + clib_warning + ("ACL enabling on interface sw_if_index %d, setting tables to the following: ip4: %d ip6: %d\n", + sw_if_index, ip4_table_index, ip6_table_index); + if (rv) + { + acl_classify_add_del_table_tiny (cm, ip6_5tuple_mask, + sizeof (ip6_5tuple_mask) - 1, ~0, + am->l2_output_classify_next_acl_ip6, + &ip6_table_index, 0); + acl_classify_add_del_table_tiny (cm, ip4_5tuple_mask, + sizeof (ip4_5tuple_mask) - 1, ~0, + am->l2_output_classify_next_acl_ip4, + &ip4_table_index, 0); + goto done; + } + + am->acl_ip4_output_classify_table_by_sw_if_index[sw_if_index] = + ip4_table_index; + am->acl_ip6_output_classify_table_by_sw_if_index[sw_if_index] = + ip6_table_index; + + vnet_l2_output_classify_enable_disable (sw_if_index, 1); +done: + clib_mem_set_heap (prevheap); + return rv; +} + + + +int +acl_interface_in_enable_disable (acl_main_t * am, u32 sw_if_index, + int enable_disable) +{ + int rv; + + /* Utterly wrong? 
*/ + if (pool_is_free_index (am->vnet_main->interface_main.sw_interfaces, + sw_if_index)) + return VNET_API_ERROR_INVALID_SW_IF_INDEX; + + acl_fa_enable_disable(sw_if_index, 1, enable_disable); + + if (enable_disable) + { + rv = acl_hook_l2_input_classify (am, sw_if_index); + } + else + { + rv = acl_unhook_l2_input_classify (am, sw_if_index); + } + + return rv; +} + +int +acl_interface_out_enable_disable (acl_main_t * am, u32 sw_if_index, + int enable_disable) +{ + int rv; + + /* Utterly wrong? */ + if (pool_is_free_index (am->vnet_main->interface_main.sw_interfaces, + sw_if_index)) + return VNET_API_ERROR_INVALID_SW_IF_INDEX; + + acl_fa_enable_disable(sw_if_index, 0, enable_disable); + + if (enable_disable) + { + rv = acl_hook_l2_output_classify (am, sw_if_index); + } + else + { + rv = acl_unhook_l2_output_classify (am, sw_if_index); + } + + return rv; +} + +static int +acl_is_not_defined(acl_main_t *am, u32 acl_list_index) +{ + return (pool_is_free_index (am->acls, acl_list_index)); +} + + +static int +acl_interface_add_inout_acl (u32 sw_if_index, u8 is_input, u32 acl_list_index) +{ + acl_main_t *am = &acl_main; + if (acl_is_not_defined(am, acl_list_index)) { + /* ACL is not defined. Can not apply */ + return -1; + } + void *oldheap = acl_set_heap(am); + + if (is_input) + { + vec_validate (am->input_acl_vec_by_sw_if_index, sw_if_index); + + u32 index = vec_search(am->input_acl_vec_by_sw_if_index[sw_if_index], acl_list_index); + if (index < vec_len(am->input_acl_vec_by_sw_if_index[sw_if_index])) { + clib_warning("ACL %d is already applied inbound on sw_if_index %d (index %d)", + acl_list_index, sw_if_index, index); + /* the entry is already there */ + clib_mem_set_heap (oldheap); + return -1; + } + /* if there was no ACL applied before, enable the ACL processing */ + if (vec_len(am->input_acl_vec_by_sw_if_index[sw_if_index]) == 0) { + acl_interface_in_enable_disable (am, sw_if_index, 1); + } + vec_add (am->input_acl_vec_by_sw_if_index[sw_if_index], &acl_list_index, + 1); + vec_validate (am->input_sw_if_index_vec_by_acl, acl_list_index); + vec_add (am->input_sw_if_index_vec_by_acl[acl_list_index], &sw_if_index, + 1); + } + else + { + vec_validate (am->output_acl_vec_by_sw_if_index, sw_if_index); + + u32 index = vec_search(am->output_acl_vec_by_sw_if_index[sw_if_index], acl_list_index); + if (index < vec_len(am->output_acl_vec_by_sw_if_index[sw_if_index])) { + clib_warning("ACL %d is already applied outbound on sw_if_index %d (index %d)", + acl_list_index, sw_if_index, index); + /* the entry is already there */ + clib_mem_set_heap (oldheap); + return -1; + } + /* if there was no ACL applied before, enable the ACL processing */ + if (vec_len(am->output_acl_vec_by_sw_if_index[sw_if_index]) == 0) { + acl_interface_out_enable_disable (am, sw_if_index, 1); + } + vec_add (am->output_acl_vec_by_sw_if_index[sw_if_index], + &acl_list_index, 1); + vec_validate (am->output_sw_if_index_vec_by_acl, acl_list_index); + vec_add (am->output_sw_if_index_vec_by_acl[acl_list_index], &sw_if_index, + 1); + } + clib_mem_set_heap (oldheap); + return 0; +} + + +static int +acl_interface_del_inout_acl (u32 sw_if_index, u8 is_input, u32 acl_list_index) +{ + acl_main_t *am = &acl_main; + int i; + int rv = -1; + void *oldheap = acl_set_heap(am); + if (is_input) + { + vec_validate (am->input_acl_vec_by_sw_if_index, sw_if_index); + for (i = 0; i < vec_len (am->input_acl_vec_by_sw_if_index[sw_if_index]); + i++) + { + if (acl_list_index == + am->input_acl_vec_by_sw_if_index[sw_if_index][i]) + { + vec_del1 
(am->input_acl_vec_by_sw_if_index[sw_if_index], i); + rv = 0; + break; + } + } + + if (acl_list_index < vec_len(am->input_sw_if_index_vec_by_acl)) { + u32 index = vec_search(am->input_sw_if_index_vec_by_acl[acl_list_index], sw_if_index); + if (index < vec_len(am->input_sw_if_index_vec_by_acl[acl_list_index])) { + hash_acl_unapply(am, sw_if_index, is_input, acl_list_index); + vec_del1 (am->input_sw_if_index_vec_by_acl[acl_list_index], index); + } + } + + /* If there is no more ACLs applied on an interface, disable ACL processing */ + if (0 == vec_len (am->input_acl_vec_by_sw_if_index[sw_if_index])) + { + acl_interface_in_enable_disable (am, sw_if_index, 0); + } + } + else + { + vec_validate (am->output_acl_vec_by_sw_if_index, sw_if_index); + for (i = 0; + i < vec_len (am->output_acl_vec_by_sw_if_index[sw_if_index]); i++) + { + if (acl_list_index == + am->output_acl_vec_by_sw_if_index[sw_if_index][i]) + { + vec_del1 (am->output_acl_vec_by_sw_if_index[sw_if_index], i); + rv = 0; + break; + } + } + + if (acl_list_index < vec_len(am->output_sw_if_index_vec_by_acl)) { + u32 index = vec_search(am->output_sw_if_index_vec_by_acl[acl_list_index], sw_if_index); + if (index < vec_len(am->output_sw_if_index_vec_by_acl[acl_list_index])) { + hash_acl_unapply(am, sw_if_index, is_input, acl_list_index); + vec_del1 (am->output_sw_if_index_vec_by_acl[acl_list_index], index); + } + } + + /* If there is no more ACLs applied on an interface, disable ACL processing */ + if (0 == vec_len (am->output_acl_vec_by_sw_if_index[sw_if_index])) + { + acl_interface_out_enable_disable (am, sw_if_index, 0); + } + } + clib_mem_set_heap (oldheap); + return rv; +} + +static void +acl_interface_reset_inout_acls (u32 sw_if_index, u8 is_input) +{ + acl_main_t *am = &acl_main; + int i; + void *oldheap = acl_set_heap(am); + if (is_input) + { + vec_validate (am->input_acl_vec_by_sw_if_index, sw_if_index); + if (vec_len(am->input_acl_vec_by_sw_if_index[sw_if_index]) > 0) { + acl_interface_in_enable_disable (am, sw_if_index, 0); + } + + for(i = vec_len(am->input_acl_vec_by_sw_if_index[sw_if_index])-1; i>=0; i--) { + u32 acl_list_index = am->input_acl_vec_by_sw_if_index[sw_if_index][i]; + hash_acl_unapply(am, sw_if_index, is_input, acl_list_index); + if (acl_list_index < vec_len(am->input_sw_if_index_vec_by_acl)) { + u32 index = vec_search(am->input_sw_if_index_vec_by_acl[acl_list_index], sw_if_index); + if (index < vec_len(am->input_sw_if_index_vec_by_acl[acl_list_index])) { + vec_del1 (am->input_sw_if_index_vec_by_acl[acl_list_index], index); + } + } + } + + vec_reset_length (am->input_acl_vec_by_sw_if_index[sw_if_index]); + } + else + { + vec_validate (am->output_acl_vec_by_sw_if_index, sw_if_index); + if (vec_len(am->output_acl_vec_by_sw_if_index[sw_if_index]) > 0) { + acl_interface_out_enable_disable (am, sw_if_index, 0); + } + + for(i = vec_len(am->output_acl_vec_by_sw_if_index[sw_if_index])-1; i>=0; i--) { + u32 acl_list_index = am->output_acl_vec_by_sw_if_index[sw_if_index][i]; + hash_acl_unapply(am, sw_if_index, is_input, acl_list_index); + if (acl_list_index < vec_len(am->output_sw_if_index_vec_by_acl)) { + u32 index = vec_search(am->output_sw_if_index_vec_by_acl[acl_list_index], sw_if_index); + if (index < vec_len(am->output_sw_if_index_vec_by_acl[acl_list_index])) { + vec_del1 (am->output_sw_if_index_vec_by_acl[acl_list_index], index); + } + } + } + + vec_reset_length (am->output_acl_vec_by_sw_if_index[sw_if_index]); + } + clib_mem_set_heap (oldheap); +} + +static int +acl_interface_add_del_inout_acl (u32 sw_if_index, u8 
is_add, u8 is_input, + u32 acl_list_index) +{ + int rv = -1; + acl_main_t *am = &acl_main; + if (is_add) + { + rv = + acl_interface_add_inout_acl (sw_if_index, is_input, acl_list_index); + if (rv == 0) + { + hash_acl_apply(am, sw_if_index, is_input, acl_list_index); + } + } + else + { + hash_acl_unapply(am, sw_if_index, is_input, acl_list_index); + rv = + acl_interface_del_inout_acl (sw_if_index, is_input, acl_list_index); + } + return rv; +} + + +typedef struct +{ + u8 is_ipv6; + u8 mac_mask[6]; + u8 prefix_len; + u32 count; + u32 table_index; + u32 arp_table_index; +} macip_match_type_t; + +static u32 +macip_find_match_type (macip_match_type_t * mv, u8 * mac_mask, u8 prefix_len, + u8 is_ipv6) +{ + u32 i; + if (mv) + { + for (i = 0; i < vec_len (mv); i++) + { + if ((mv[i].prefix_len == prefix_len) && (mv[i].is_ipv6 == is_ipv6) + && (0 == memcmp (mv[i].mac_mask, mac_mask, 6))) + { + return i; + } + } + } + return ~0; +} + + +/* Get metric used to sort match types. + The more specific and the more often seen - the bigger the metric */ +static int +match_type_metric (macip_match_type_t * m) +{ + unsigned int mac_bits_set = 0; + unsigned int mac_byte; + int i; + for (i=0; i<6; i++) + { + mac_byte = m->mac_mask[i]; + for (; mac_byte; mac_byte >>= 1) + mac_bits_set += mac_byte & 1; + } + /* + * Attempt to place the more specific and the more used rules on top. + * There are obvious caveat corner cases to this, but they do not + * seem to be sensible in real world (e.g. specific IPv4 with wildcard MAC + * going with a wildcard IPv4 with a specific MAC). + */ + return m->prefix_len + mac_bits_set + m->is_ipv6 + 10 * m->count; +} + +static int +match_type_compare (macip_match_type_t * m1, macip_match_type_t * m2) +{ + /* Ascending sort based on the metric values */ + return match_type_metric (m1) - match_type_metric (m2); +} + +/* Get the offset of L3 source within ethernet packet */ +static int +get_l3_src_offset(int is6) +{ + if(is6) + return (sizeof(ethernet_header_t) + offsetof(ip6_header_t, src_address)); + else + return (sizeof(ethernet_header_t) + offsetof(ip4_header_t, src_address)); +} + +static int +macip_create_classify_tables (acl_main_t * am, u32 macip_acl_index) +{ + macip_match_type_t *mvec = NULL; + macip_match_type_t *mt; + macip_acl_list_t *a = pool_elt_at_index (am->macip_acls, macip_acl_index); + int i; + u32 match_type_index; + u32 last_table; + u8 mask[5 * 16]; + vnet_classify_main_t *cm = &vnet_classify_main; + + /* Count the number of different types of rules */ + for (i = 0; i < a->count; i++) + { + if (~0 == + (match_type_index = + macip_find_match_type (mvec, a->rules[i].src_mac_mask, + a->rules[i].src_prefixlen, + a->rules[i].is_ipv6))) + { + match_type_index = vec_len (mvec); + vec_validate (mvec, match_type_index); + memcpy (mvec[match_type_index].mac_mask, + a->rules[i].src_mac_mask, 6); + mvec[match_type_index].prefix_len = a->rules[i].src_prefixlen; + mvec[match_type_index].is_ipv6 = a->rules[i].is_ipv6; + mvec[match_type_index].table_index = ~0; + } + mvec[match_type_index].count++; + } + /* Put the most frequently used tables last in the list so we can create classifier tables in reverse order */ + vec_sort_with_function (mvec, match_type_compare); + /* Create the classifier tables */ + last_table = ~0; + /* First add ARP tables */ + vec_foreach (mt, mvec) + { + int mask_len; + int is6 = mt->is_ipv6; + + mt->arp_table_index = ~0; + if (!is6) + { + memset (mask, 0, sizeof (mask)); + memcpy (&mask[6], mt->mac_mask, 6); + memset (&mask[12], 0xff, 2); /* ethernet 
protocol */ + memcpy (&mask[14 + 8], mt->mac_mask, 6); + + for (i = 0; i < (mt->prefix_len / 8); i++) + mask[14 + 14 + i] = 0xff; + if (mt->prefix_len % 8) + mask[14 + 14 + (mt->prefix_len / 8)] = 0xff - ((1 << (8 - mt->prefix_len % 8)) - 1); + + mask_len = ((14 + 14 + ((mt->prefix_len+7) / 8) + + (sizeof (u32x4)-1))/sizeof(u32x4)) * sizeof (u32x4); + acl_classify_add_del_table_small (cm, mask, mask_len, last_table, + (~0 == last_table) ? 0 : ~0, &mt->arp_table_index, + 1); + last_table = mt->arp_table_index; + } + } + /* Now add IP[46] tables */ + vec_foreach (mt, mvec) + { + int mask_len; + int is6 = mt->is_ipv6; + int l3_src_offs = get_l3_src_offset(is6); + memset (mask, 0, sizeof (mask)); + memcpy (&mask[6], mt->mac_mask, 6); + for (i = 0; i < (mt->prefix_len / 8); i++) + { + mask[l3_src_offs + i] = 0xff; + } + if (mt->prefix_len % 8) + { + mask[l3_src_offs + (mt->prefix_len / 8)] = + 0xff - ((1 << (8 - mt->prefix_len % 8)) - 1); + } + /* + * Round-up the number of bytes needed to store the prefix, + * and round up the number of vectors too + */ + mask_len = ((l3_src_offs + ((mt->prefix_len+7) / 8) + + (sizeof (u32x4)-1))/sizeof(u32x4)) * sizeof (u32x4); + acl_classify_add_del_table_small (cm, mask, mask_len, last_table, + (~0 == last_table) ? 0 : ~0, &mt->table_index, + 1); + last_table = mt->table_index; + } + a->ip4_table_index = last_table; + a->ip6_table_index = last_table; + a->l2_table_index = last_table; + + /* Populate the classifier tables with rules from the MACIP ACL */ + for (i = 0; i < a->count; i++) + { + u32 action = 0; + u32 metadata = 0; + int is6 = a->rules[i].is_ipv6; + int l3_src_offs = get_l3_src_offset(is6); + memset (mask, 0, sizeof (mask)); + memcpy (&mask[6], a->rules[i].src_mac, 6); + memset (&mask[12], 0xff, 2); /* ethernet protocol */ + if (is6) + { + memcpy (&mask[l3_src_offs], &a->rules[i].src_ip_addr.ip6, 16); + mask[12] = 0x86; + mask[13] = 0xdd; + } + else + { + memcpy (&mask[l3_src_offs], &a->rules[i].src_ip_addr.ip4, 4); + mask[12] = 0x08; + mask[13] = 0x00; + } + match_type_index = + macip_find_match_type (mvec, a->rules[i].src_mac_mask, + a->rules[i].src_prefixlen, + a->rules[i].is_ipv6); + ASSERT(match_type_index != ~0); + /* add session to table mvec[match_type_index].table_index; */ + vnet_classify_add_del_session (cm, mvec[match_type_index].table_index, + mask, a->rules[i].is_permit ? ~0 : 0, i, + 0, action, metadata, 1); + /* add ARP table entry too */ + if (!is6 && (mvec[match_type_index].arp_table_index != ~0)) + { + memset (mask, 0, sizeof (mask)); + memcpy (&mask[6], a->rules[i].src_mac, 6); + mask[12] = 0x08; + mask[13] = 0x06; + memcpy (&mask[14 + 8], a->rules[i].src_mac, 6); + memcpy (&mask[14 + 14], &a->rules[i].src_ip_addr.ip4, 4); + vnet_classify_add_del_session (cm, mvec[match_type_index].arp_table_index, + mask, a->rules[i].is_permit ? 
~0 : 0, i, + 0, action, metadata, 1); + } + } + return 0; +} + +static void +macip_destroy_classify_tables (acl_main_t * am, u32 macip_acl_index) +{ + vnet_classify_main_t *cm = &vnet_classify_main; + macip_acl_list_t *a = pool_elt_at_index (am->macip_acls, macip_acl_index); + + if (a->ip4_table_index != ~0) + { + acl_classify_add_del_table_small (cm, 0, ~0, ~0, ~0, &a->ip4_table_index, 0); + a->ip4_table_index = ~0; + } + if (a->ip6_table_index != ~0) + { + acl_classify_add_del_table_small (cm, 0, ~0, ~0, ~0, &a->ip6_table_index, 0); + a->ip6_table_index = ~0; + } + if (a->l2_table_index != ~0) + { + acl_classify_add_del_table_small (cm, 0, ~0, ~0, ~0, &a->l2_table_index, 0); + a->l2_table_index = ~0; + } +} + +static int +macip_acl_add_list (u32 count, vl_api_macip_acl_rule_t rules[], + u32 * acl_list_index, u8 * tag) +{ + acl_main_t *am = &acl_main; + macip_acl_list_t *a; + macip_acl_rule_t *r; + macip_acl_rule_t *acl_new_rules = 0; + int i; + + if (*acl_list_index != ~0) + { + /* They supplied some number, let's see if this MACIP ACL exists */ + if (pool_is_free_index (am->macip_acls, *acl_list_index)) + { + /* tried to replace a non-existent ACL, no point doing anything */ + clib_warning("acl-plugin-error: Trying to replace nonexistent MACIP ACL %d (tag %s)", *acl_list_index, tag); + return -1; + } + } + + if (0 == count) { + clib_warning("acl-plugin-warning: Trying to create empty MACIP ACL (tag %s)", tag); + } + void *oldheap = acl_set_heap(am); + /* Create and populate the rules */ + if (count > 0) + vec_validate(acl_new_rules, count-1); + + for (i = 0; i < count; i++) + { + r = &acl_new_rules[i]; + r->is_permit = rules[i].is_permit; + r->is_ipv6 = rules[i].is_ipv6; + memcpy (&r->src_mac, rules[i].src_mac, 6); + memcpy (&r->src_mac_mask, rules[i].src_mac_mask, 6); + if(rules[i].is_ipv6) + memcpy (&r->src_ip_addr.ip6, rules[i].src_ip_addr, 16); + else + memcpy (&r->src_ip_addr.ip4, rules[i].src_ip_addr, 4); + r->src_prefixlen = rules[i].src_ip_prefix_len; + } + + if (~0 == *acl_list_index) + { + /* Get ACL index */ + pool_get_aligned (am->macip_acls, a, CLIB_CACHE_LINE_BYTES); + memset (a, 0, sizeof (*a)); + /* Will return the newly allocated ACL index */ + *acl_list_index = a - am->macip_acls; + } + else + { + a = pool_elt_at_index (am->macip_acls, *acl_list_index); + if (a->rules) + { + vec_free (a->rules); + } + macip_destroy_classify_tables (am, *acl_list_index); + } + + a->rules = acl_new_rules; + a->count = count; + memcpy (a->tag, tag, sizeof (a->tag)); + + /* Create and populate the classifer tables */ + macip_create_classify_tables (am, *acl_list_index); + clib_mem_set_heap (oldheap); + return 0; +} + + +/* No check for validity of sw_if_index - the callers were supposed to validate */ + +static int +macip_acl_interface_del_acl (acl_main_t * am, u32 sw_if_index) +{ + int rv; + u32 macip_acl_index; + macip_acl_list_t *a; + void *oldheap = acl_set_heap(am); + vec_validate_init_empty (am->macip_acl_by_sw_if_index, sw_if_index, ~0); + clib_mem_set_heap (oldheap); + macip_acl_index = am->macip_acl_by_sw_if_index[sw_if_index]; + /* No point in deleting MACIP ACL which is not applied */ + if (~0 == macip_acl_index) + return -1; + a = pool_elt_at_index (am->macip_acls, macip_acl_index); + /* remove the classifier tables off the interface L2 ACL */ + rv = + vnet_set_input_acl_intfc (am->vlib_main, sw_if_index, a->ip4_table_index, + a->ip6_table_index, a->l2_table_index, 0); + /* Unset the MACIP ACL index */ + am->macip_acl_by_sw_if_index[sw_if_index] = ~0; + return rv; +} + +/* No 
check for validity of sw_if_index - the callers were supposed to validate */
+
+static int
+macip_acl_interface_add_acl (acl_main_t * am, u32 sw_if_index,
+                             u32 macip_acl_index)
+{
+  macip_acl_list_t *a;
+  int rv;
+  if (pool_is_free_index (am->macip_acls, macip_acl_index))
+    {
+      return -1;
+    }
+  void *oldheap = acl_set_heap(am);
+  a = pool_elt_at_index (am->macip_acls, macip_acl_index);
+  vec_validate_init_empty (am->macip_acl_by_sw_if_index, sw_if_index, ~0);
+  clib_mem_set_heap (oldheap);
+  /* If there is already a MACIP ACL applied, unapply it */
+  if (~0 != am->macip_acl_by_sw_if_index[sw_if_index])
+    macip_acl_interface_del_acl(am, sw_if_index);
+  am->macip_acl_by_sw_if_index[sw_if_index] = macip_acl_index;
+
+  /* Apply the classifier tables for L2 ACLs */
+  rv =
+    vnet_set_input_acl_intfc (am->vlib_main, sw_if_index, a->ip4_table_index,
+                              a->ip6_table_index, a->l2_table_index, 1);
+  return rv;
+}
+
+static int
+macip_acl_del_list (u32 acl_list_index)
+{
+  acl_main_t *am = &acl_main;
+  macip_acl_list_t *a;
+  int i;
+  if (pool_is_free_index (am->macip_acls, acl_list_index))
+    {
+      return -1;
+    }
+
+  /* delete any references to the ACL */
+  for (i = 0; i < vec_len (am->macip_acl_by_sw_if_index); i++)
+    {
+      if (am->macip_acl_by_sw_if_index[i] == acl_list_index)
+        {
+          macip_acl_interface_del_acl (am, i);
+        }
+    }
+
+  void *oldheap = acl_set_heap(am);
+  /* Now that classifier tables are detached, clean them up */
+  macip_destroy_classify_tables (am, acl_list_index);
+
+  /* now we can delete the ACL itself */
+  a = pool_elt_at_index (am->macip_acls, acl_list_index);
+  if (a->rules)
+    {
+      vec_free (a->rules);
+    }
+  pool_put (am->macip_acls, a);
+  clib_mem_set_heap (oldheap);
+  return 0;
+}
+
+
+static int
+macip_acl_interface_add_del_acl (u32 sw_if_index, u8 is_add,
+                                 u32 acl_list_index)
+{
+  acl_main_t *am = &acl_main;
+  int rv = -1;
+  if (is_add)
+    {
+      rv = macip_acl_interface_add_acl (am, sw_if_index, acl_list_index);
+    }
+  else
+    {
+      rv = macip_acl_interface_del_acl (am, sw_if_index);
+    }
+  return rv;
+}
+
+/*
+ * If the client does not allocate enough memory for a variable-length
+ * message, and then proceeds to use it as if the full memory were
+ * allocated, then absent this check we happily consume that on the
+ * VPP side, and go along as if nothing happened. However, the
+ * resulting effects range from just garbage in the API decode
+ * (because the decoder snoops too far), to potential memory
+ * corruption.
+ *
+ * This verifies that the actual length of the message is
+ * at least expected_len, and complains loudly if it is not.
+ *
+ * A failing check here is 100% a software bug on the API user side,
+ * so we might as well yell.
+ */
+static int verify_message_len(void *mp, u32 expected_len, char *where)
+{
+  u32 supplied_len = vl_msg_api_get_msg_length (mp);
+  if (supplied_len < expected_len) {
+    clib_warning("%s: Supplied message length %d is less than expected %d",
+                 where, supplied_len, expected_len);
+    return 0;
+  } else {
+    return 1;
+  }
+}
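[Editorial note: the check above exists because the trailing rule array is easy to forget when sizing a variable-length message. The following is a minimal client-side allocation sketch, not part of this commit; it assumes the generated message types and the vl_msg_api_alloc() allocator used elsewhere in this file, and omits the _vl_msg_id setup. Sized this way, the message satisfies verify_message_len() for an acl_add_replace carrying two rules.]

    /* Sketch: allocate an acl_add_replace message with n_rules trailing rules.
       The trailing array must be counted in, or verify_message_len() on the
       VPP side will reject the message. */
    u32 n_rules = 2;
    u32 msg_len = sizeof (vl_api_acl_add_replace_t)
                  + n_rules * sizeof (vl_api_acl_rule_t);
    vl_api_acl_add_replace_t *mp = vl_msg_api_alloc (msg_len);
    memset (mp, 0, msg_len);
    mp->acl_index = htonl (~0);   /* ~0 = create a new ACL */
    mp->count = htonl (n_rules);  /* the handler recovers this via ntohl() */
    /* fill mp->r[0] and mp->r[1] before sending */

[This mirrors the expected_len arithmetic, sizeof(*mp) + count * sizeof(mp->r[0]), in the handlers that follow.]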
+
+/* API message handler */
+static void
+vl_api_acl_add_replace_t_handler (vl_api_acl_add_replace_t * mp)
+{
+  vl_api_acl_add_replace_reply_t *rmp;
+  acl_main_t *am = &acl_main;
+  int rv;
+  u32 acl_list_index = ntohl (mp->acl_index);
+  u32 acl_count = ntohl (mp->count);
+  u32 expected_len = sizeof(*mp) + acl_count*sizeof(mp->r[0]);
+
+  if (verify_message_len(mp, expected_len, "acl_add_replace")) {
+    rv = acl_add_list (acl_count, mp->r, &acl_list_index, mp->tag);
+  } else {
+    rv = VNET_API_ERROR_INVALID_VALUE;
+  }
+
+  /* *INDENT-OFF* */
+  REPLY_MACRO2(VL_API_ACL_ADD_REPLACE_REPLY,
+  ({
+    rmp->acl_index = htonl(acl_list_index);
+  }));
+  /* *INDENT-ON* */
+}
+
+static void
+vl_api_acl_del_t_handler (vl_api_acl_del_t * mp)
+{
+  acl_main_t *am = &acl_main;
+  vl_api_acl_del_reply_t *rmp;
+  int rv;
+
+  rv = acl_del_list (ntohl (mp->acl_index));
+
+  REPLY_MACRO (VL_API_ACL_DEL_REPLY);
+}
+
+static void
+vl_api_acl_interface_add_del_t_handler (vl_api_acl_interface_add_del_t * mp)
+{
+  acl_main_t *am = &acl_main;
+  vnet_interface_main_t *im = &am->vnet_main->interface_main;
+  u32 sw_if_index = ntohl (mp->sw_if_index);
+  vl_api_acl_interface_add_del_reply_t *rmp;
+  int rv = -1;
+
+  if (pool_is_free_index(im->sw_interfaces, sw_if_index))
+    rv = VNET_API_ERROR_INVALID_SW_IF_INDEX;
+  else
+    rv =
+      acl_interface_add_del_inout_acl (sw_if_index, mp->is_add,
+                                       mp->is_input, ntohl (mp->acl_index));
+
+  REPLY_MACRO (VL_API_ACL_INTERFACE_ADD_DEL_REPLY);
+}
+
+static void
+vl_api_acl_interface_set_acl_list_t_handler
+  (vl_api_acl_interface_set_acl_list_t * mp)
+{
+  acl_main_t *am = &acl_main;
+  vl_api_acl_interface_set_acl_list_reply_t *rmp;
+  int rv = 0;
+  int i;
+  vnet_interface_main_t *im = &am->vnet_main->interface_main;
+  u32 sw_if_index = ntohl (mp->sw_if_index);
+
+  if (pool_is_free_index(im->sw_interfaces, sw_if_index))
+    rv = VNET_API_ERROR_INVALID_SW_IF_INDEX;
+  else
+    {
+      acl_interface_reset_inout_acls (sw_if_index, 0);
+      acl_interface_reset_inout_acls (sw_if_index, 1);
+
+      for (i = 0; i < mp->count; i++)
+        {
+          if(acl_is_not_defined(am, ntohl (mp->acls[i]))) {
+            /* ACL does not exist, so we cannot apply it */
+            rv = -1;
+          }
+        }
+      if (0 == rv) {
+        for (i = 0; i < mp->count; i++)
+          {
+            acl_interface_add_del_inout_acl (sw_if_index, 1, (i < mp->n_input),
+                                             ntohl (mp->acls[i]));
+          }
+      }
+    }
+
+  REPLY_MACRO (VL_API_ACL_INTERFACE_SET_ACL_LIST_REPLY);
+}
+
+static void
+copy_acl_rule_to_api_rule (vl_api_acl_rule_t * api_rule, acl_rule_t * r)
+{
+  api_rule->is_permit = r->is_permit;
+  api_rule->is_ipv6 = r->is_ipv6;
+  if(r->is_ipv6)
+    {
+      memcpy (api_rule->src_ip_addr, &r->src, sizeof (r->src));
+      memcpy (api_rule->dst_ip_addr, &r->dst, sizeof (r->dst));
+    }
+  else
+    {
+      memcpy (api_rule->src_ip_addr, &r->src.ip4, sizeof (r->src.ip4));
+      memcpy (api_rule->dst_ip_addr, &r->dst.ip4, sizeof (r->dst.ip4));
+    }
+  api_rule->src_ip_prefix_len = r->src_prefixlen;
+  api_rule->dst_ip_prefix_len = r->dst_prefixlen;
+  api_rule->proto = r->proto;
+  api_rule->srcport_or_icmptype_first = htons (r->src_port_or_type_first);
+  api_rule->srcport_or_icmptype_last = htons (r->src_port_or_type_last);
+  api_rule->dstport_or_icmpcode_first = htons (r->dst_port_or_code_first);
+
api_rule->dstport_or_icmpcode_last = htons (r->dst_port_or_code_last); + api_rule->tcp_flags_mask = r->tcp_flags_mask; + api_rule->tcp_flags_value = r->tcp_flags_value; +} + +static void +send_acl_details (acl_main_t * am, unix_shared_memory_queue_t * q, + acl_list_t * acl, u32 context) +{ + vl_api_acl_details_t *mp; + vl_api_acl_rule_t *rules; + int i; + int msg_size = sizeof (*mp) + sizeof (mp->r[0]) * acl->count; + void *oldheap = acl_set_heap(am); + + mp = vl_msg_api_alloc (msg_size); + memset (mp, 0, msg_size); + mp->_vl_msg_id = ntohs (VL_API_ACL_DETAILS + am->msg_id_base); + + /* fill in the message */ + mp->context = context; + mp->count = htonl (acl->count); + mp->acl_index = htonl (acl - am->acls); + memcpy (mp->tag, acl->tag, sizeof (mp->tag)); + // clib_memcpy (mp->r, acl->rules, acl->count * sizeof(acl->rules[0])); + rules = mp->r; + for (i = 0; i < acl->count; i++) + { + copy_acl_rule_to_api_rule (&rules[i], &acl->rules[i]); + } + + clib_mem_set_heap (oldheap); + vl_msg_api_send_shmem (q, (u8 *) & mp); +} + + +static void +vl_api_acl_dump_t_handler (vl_api_acl_dump_t * mp) +{ + acl_main_t *am = &acl_main; + u32 acl_index; + acl_list_t *acl; + + int rv = -1; + unix_shared_memory_queue_t *q; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + { + return; + } + + if (mp->acl_index == ~0) + { + /* *INDENT-OFF* */ + /* Just dump all ACLs */ + pool_foreach (acl, am->acls, + ({ + send_acl_details(am, q, acl, mp->context); + })); + /* *INDENT-ON* */ + } + else + { + acl_index = ntohl (mp->acl_index); + if (!pool_is_free_index (am->acls, acl_index)) + { + acl = pool_elt_at_index (am->acls, acl_index); + send_acl_details (am, q, acl, mp->context); + } + } + + if (rv == -1) + { + /* FIXME API: should we signal an error here at all ? 
*/ + return; + } +} + +static void +send_acl_interface_list_details (acl_main_t * am, + unix_shared_memory_queue_t * q, + u32 sw_if_index, u32 context) +{ + vl_api_acl_interface_list_details_t *mp; + int msg_size; + int n_input; + int n_output; + int count; + int i = 0; + void *oldheap = acl_set_heap(am); + + vec_validate (am->input_acl_vec_by_sw_if_index, sw_if_index); + vec_validate (am->output_acl_vec_by_sw_if_index, sw_if_index); + + n_input = vec_len (am->input_acl_vec_by_sw_if_index[sw_if_index]); + n_output = vec_len (am->output_acl_vec_by_sw_if_index[sw_if_index]); + count = n_input + n_output; + + msg_size = sizeof (*mp); + msg_size += sizeof (mp->acls[0]) * count; + + mp = vl_msg_api_alloc (msg_size); + memset (mp, 0, msg_size); + mp->_vl_msg_id = + ntohs (VL_API_ACL_INTERFACE_LIST_DETAILS + am->msg_id_base); + + /* fill in the message */ + mp->context = context; + mp->sw_if_index = htonl (sw_if_index); + mp->count = count; + mp->n_input = n_input; + for (i = 0; i < n_input; i++) + { + mp->acls[i] = htonl (am->input_acl_vec_by_sw_if_index[sw_if_index][i]); + } + for (i = 0; i < n_output; i++) + { + mp->acls[n_input + i] = + htonl (am->output_acl_vec_by_sw_if_index[sw_if_index][i]); + } + clib_mem_set_heap (oldheap); + vl_msg_api_send_shmem (q, (u8 *) & mp); +} + +static void +vl_api_acl_interface_list_dump_t_handler (vl_api_acl_interface_list_dump_t * + mp) +{ + acl_main_t *am = &acl_main; + vnet_sw_interface_t *swif; + vnet_interface_main_t *im = &am->vnet_main->interface_main; + + u32 sw_if_index; + unix_shared_memory_queue_t *q; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + { + return; + } + + if (mp->sw_if_index == ~0) + { + /* *INDENT-OFF* */ + pool_foreach (swif, im->sw_interfaces, + ({ + send_acl_interface_list_details(am, q, swif->sw_if_index, mp->context); + })); + /* *INDENT-ON* */ + } + else + { + sw_if_index = ntohl (mp->sw_if_index); + if (!pool_is_free_index(im->sw_interfaces, sw_if_index)) + send_acl_interface_list_details (am, q, sw_if_index, mp->context); + } +} + +/* MACIP ACL API handlers */ + +static void +vl_api_macip_acl_add_t_handler (vl_api_macip_acl_add_t * mp) +{ + vl_api_macip_acl_add_reply_t *rmp; + acl_main_t *am = &acl_main; + int rv; + u32 acl_list_index = ~0; + u32 acl_count = ntohl (mp->count); + u32 expected_len = sizeof(*mp) + acl_count*sizeof(mp->r[0]); + + if (verify_message_len(mp, expected_len, "macip_acl_add")) { + rv = macip_acl_add_list (acl_count, mp->r, &acl_list_index, mp->tag); + } else { + rv = VNET_API_ERROR_INVALID_VALUE; + } + + /* *INDENT-OFF* */ + REPLY_MACRO2(VL_API_MACIP_ACL_ADD_REPLY, + ({ + rmp->acl_index = htonl(acl_list_index); + })); + /* *INDENT-ON* */ +} + +static void +vl_api_macip_acl_add_replace_t_handler (vl_api_macip_acl_add_replace_t * mp) +{ + vl_api_macip_acl_add_replace_reply_t *rmp; + acl_main_t *am = &acl_main; + int rv; + u32 acl_list_index = ntohl (mp->acl_index); + u32 acl_count = ntohl (mp->count); + u32 expected_len = sizeof(*mp) + acl_count*sizeof(mp->r[0]); + + if (verify_message_len(mp, expected_len, "macip_acl_add_replace")) { + rv = macip_acl_add_list (acl_count, mp->r, &acl_list_index, mp->tag); + } else { + rv = VNET_API_ERROR_INVALID_VALUE; + } + + /* *INDENT-OFF* */ + REPLY_MACRO2(VL_API_MACIP_ACL_ADD_REPLACE_REPLY, + ({ + rmp->acl_index = htonl(acl_list_index); + })); + /* *INDENT-ON* */ +} + +static void +vl_api_macip_acl_del_t_handler (vl_api_macip_acl_del_t * mp) +{ + acl_main_t *am = &acl_main; + vl_api_macip_acl_del_reply_t *rmp; + int rv; + + rv = 
macip_acl_del_list (ntohl (mp->acl_index)); + + REPLY_MACRO (VL_API_MACIP_ACL_DEL_REPLY); +} + +static void +vl_api_macip_acl_interface_add_del_t_handler + (vl_api_macip_acl_interface_add_del_t * mp) +{ + acl_main_t *am = &acl_main; + vl_api_macip_acl_interface_add_del_reply_t *rmp; + int rv = -1; + vnet_interface_main_t *im = &am->vnet_main->interface_main; + u32 sw_if_index = ntohl (mp->sw_if_index); + + if (pool_is_free_index(im->sw_interfaces, sw_if_index)) + rv = VNET_API_ERROR_INVALID_SW_IF_INDEX; + else + rv = + macip_acl_interface_add_del_acl (ntohl (mp->sw_if_index), mp->is_add, + ntohl (mp->acl_index)); + + REPLY_MACRO (VL_API_MACIP_ACL_INTERFACE_ADD_DEL_REPLY); +} + +static void +send_macip_acl_details (acl_main_t * am, unix_shared_memory_queue_t * q, + macip_acl_list_t * acl, u32 context) +{ + vl_api_macip_acl_details_t *mp; + vl_api_macip_acl_rule_t *rules; + macip_acl_rule_t *r; + int i; + int msg_size = sizeof (*mp) + (acl ? sizeof (mp->r[0]) * acl->count : 0); + + mp = vl_msg_api_alloc (msg_size); + memset (mp, 0, msg_size); + mp->_vl_msg_id = ntohs (VL_API_MACIP_ACL_DETAILS + am->msg_id_base); + + /* fill in the message */ + mp->context = context; + if (acl) + { + memcpy (mp->tag, acl->tag, sizeof (mp->tag)); + mp->count = htonl (acl->count); + mp->acl_index = htonl (acl - am->macip_acls); + rules = mp->r; + for (i = 0; i < acl->count; i++) + { + r = &acl->rules[i]; + rules[i].is_permit = r->is_permit; + rules[i].is_ipv6 = r->is_ipv6; + memcpy (rules[i].src_mac, &r->src_mac, sizeof (r->src_mac)); + memcpy (rules[i].src_mac_mask, &r->src_mac_mask, + sizeof (r->src_mac_mask)); + if (r->is_ipv6) + memcpy (rules[i].src_ip_addr, &r->src_ip_addr.ip6, + sizeof (r->src_ip_addr.ip6)); + else + memcpy (rules[i].src_ip_addr, &r->src_ip_addr.ip4, + sizeof (r->src_ip_addr.ip4)); + rules[i].src_ip_prefix_len = r->src_prefixlen; + } + } + else + { + /* No martini, no party - no ACL applied to this interface. 
*/ + mp->acl_index = ~0; + mp->count = 0; + } + + vl_msg_api_send_shmem (q, (u8 *) & mp); +} + + +static void +vl_api_macip_acl_dump_t_handler (vl_api_macip_acl_dump_t * mp) +{ + acl_main_t *am = &acl_main; + macip_acl_list_t *acl; + + unix_shared_memory_queue_t *q; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + { + return; + } + + if (mp->acl_index == ~0) + { + /* Just dump all ACLs for now, with sw_if_index = ~0 */ + pool_foreach (acl, am->macip_acls, ( + { + send_macip_acl_details (am, q, acl, + mp-> + context);} + )); + /* *INDENT-ON* */ + } + else + { + u32 acl_index = ntohl (mp->acl_index); + if (!pool_is_free_index (am->macip_acls, acl_index)) + { + acl = pool_elt_at_index (am->macip_acls, acl_index); + send_macip_acl_details (am, q, acl, mp->context); + } + } +} + +static void +vl_api_macip_acl_interface_get_t_handler (vl_api_macip_acl_interface_get_t * + mp) +{ + acl_main_t *am = &acl_main; + vl_api_macip_acl_interface_get_reply_t *rmp; + u32 count = vec_len (am->macip_acl_by_sw_if_index); + int msg_size = sizeof (*rmp) + sizeof (rmp->acls[0]) * count; + unix_shared_memory_queue_t *q; + int i; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + { + return; + } + + rmp = vl_msg_api_alloc (msg_size); + memset (rmp, 0, msg_size); + rmp->_vl_msg_id = + ntohs (VL_API_MACIP_ACL_INTERFACE_GET_REPLY + am->msg_id_base); + rmp->context = mp->context; + rmp->count = htonl (count); + for (i = 0; i < count; i++) + { + rmp->acls[i] = htonl (am->macip_acl_by_sw_if_index[i]); + } + + vl_msg_api_send_shmem (q, (u8 *) & rmp); +} + +static void +send_macip_acl_interface_list_details (acl_main_t * am, + unix_shared_memory_queue_t * q, + u32 sw_if_index, + u32 acl_index, + u32 context) +{ + vl_api_macip_acl_interface_list_details_t *rmp; + /* at this time there is only ever 1 mac ip acl per interface */ + int msg_size = sizeof (*rmp) + sizeof (rmp->acls[0]); + + rmp = vl_msg_api_alloc (msg_size); + memset (rmp, 0, msg_size); + rmp->_vl_msg_id = ntohs (VL_API_MACIP_ACL_INTERFACE_LIST_DETAILS + am->msg_id_base); + + /* fill in the message */ + rmp->context = context; + rmp->count = 1; + rmp->sw_if_index = htonl (sw_if_index); + rmp->acls[0] = htonl (acl_index); + + vl_msg_api_send_shmem (q, (u8 *) & rmp); +} + +static void +vl_api_macip_acl_interface_list_dump_t_handler (vl_api_macip_acl_interface_list_dump_t *mp) +{ + unix_shared_memory_queue_t *q; + acl_main_t *am = &acl_main; + u32 sw_if_index = ntohl (mp->sw_if_index); + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + { + return; + } + + if (sw_if_index == ~0) + { + vec_foreach_index(sw_if_index, am->macip_acl_by_sw_if_index) + { + if (~0 != am->macip_acl_by_sw_if_index[sw_if_index]) + { + send_macip_acl_interface_list_details(am, q, sw_if_index, + am->macip_acl_by_sw_if_index[sw_if_index], + mp->context); + } + } + } + else + { + if (vec_len(am->macip_acl_by_sw_if_index) > sw_if_index) + { + send_macip_acl_interface_list_details(am, q, sw_if_index, + am->macip_acl_by_sw_if_index[sw_if_index], + mp->context); + } + } +} + +/* Set up the API message handling tables */ +static clib_error_t * +acl_plugin_api_hookup (vlib_main_t * vm) +{ + acl_main_t *am = &acl_main; +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + am->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_acl_plugin_api_msg; +#undef _ + + return 0; +} + +#define 
vl_msg_name_crc_list +#include <acl/acl_all_api_h.h> +#undef vl_msg_name_crc_list + +static void +setup_message_id_table (acl_main_t * am, api_main_t * apim) +{ +#define _(id,n,crc) \ + vl_msg_api_add_msg_name_crc (apim, #n "_" #crc, id + am->msg_id_base); + foreach_vl_msg_name_crc_acl; +#undef _ +} + +static void +acl_setup_fa_nodes (void) +{ + vlib_main_t *vm = vlib_get_main (); + acl_main_t *am = &acl_main; + vlib_node_t *n, *n4, *n6; + + n = vlib_get_node_by_name (vm, (u8 *) "l2-input-classify"); + n4 = vlib_get_node_by_name (vm, (u8 *) "acl-plugin-in-ip4-l2"); + n6 = vlib_get_node_by_name (vm, (u8 *) "acl-plugin-in-ip6-l2"); + + + am->l2_input_classify_next_acl_ip4 = + vlib_node_add_next_with_slot (vm, n->index, n4->index, ~0); + am->l2_input_classify_next_acl_ip6 = + vlib_node_add_next_with_slot (vm, n->index, n6->index, ~0); + + feat_bitmap_init_next_nodes (vm, n4->index, L2INPUT_N_FEAT, + l2input_get_feat_names (), + am->fa_acl_in_ip4_l2_node_feat_next_node_index); + + feat_bitmap_init_next_nodes (vm, n6->index, L2INPUT_N_FEAT, + l2input_get_feat_names (), + am->fa_acl_in_ip6_l2_node_feat_next_node_index); + + + n = vlib_get_node_by_name (vm, (u8 *) "l2-output-classify"); + n4 = vlib_get_node_by_name (vm, (u8 *) "acl-plugin-out-ip4-l2"); + n6 = vlib_get_node_by_name (vm, (u8 *) "acl-plugin-out-ip6-l2"); + + am->l2_output_classify_next_acl_ip4 = + vlib_node_add_next_with_slot (vm, n->index, n4->index, ~0); + am->l2_output_classify_next_acl_ip6 = + vlib_node_add_next_with_slot (vm, n->index, n6->index, ~0); + + feat_bitmap_init_next_nodes (vm, n4->index, L2OUTPUT_N_FEAT, + l2output_get_feat_names (), + am->fa_acl_out_ip4_l2_node_feat_next_node_index); + + feat_bitmap_init_next_nodes (vm, n6->index, L2OUTPUT_N_FEAT, + l2output_get_feat_names (), + am->fa_acl_out_ip6_l2_node_feat_next_node_index); +} + +static void +acl_set_timeout_sec(int timeout_type, u32 value) +{ + acl_main_t *am = &acl_main; + clib_time_t *ct = &am->vlib_main->clib_time; + + if (timeout_type < ACL_N_TIMEOUTS) { + am->session_timeout_sec[timeout_type] = value; + } else { + clib_warning("Unknown timeout type %d", timeout_type); + return; + } + am->session_timeout[timeout_type] = (u64)(((f64)value)/ct->seconds_per_clock); +} + +static void +acl_set_session_max_entries(u32 value) +{ + acl_main_t *am = &acl_main; + am->fa_conn_table_max_entries = value; +} + +static int +acl_set_skip_ipv6_eh(u32 eh, u32 value) +{ + acl_main_t *am = &acl_main; + + if ((eh < 256) && (value < 2)) + { + am->fa_ipv6_known_eh_bitmap = clib_bitmap_set(am->fa_ipv6_known_eh_bitmap, eh, value); + return 1; + } + else + return 0; +} + + +static clib_error_t * +acl_sw_interface_add_del (vnet_main_t * vnm, u32 sw_if_index, u32 is_add) +{ + acl_main_t *am = &acl_main; + if (0 == am->acl_mheap) { + /* ACL heap is not initialized, so definitely nothing to do. */ + return 0; + } + if (0 == is_add) { + vlib_process_signal_event (am->vlib_main, am->fa_cleaner_node_index, + ACL_FA_CLEANER_DELETE_BY_SW_IF_INDEX, sw_if_index); + /* also unapply any ACLs in case the users did not do so. 
*/ + macip_acl_interface_del_acl(am, sw_if_index); + acl_interface_reset_inout_acls (sw_if_index, 0); + acl_interface_reset_inout_acls (sw_if_index, 1); + } + return 0; +} + +VNET_SW_INTERFACE_ADD_DEL_FUNCTION (acl_sw_interface_add_del); + + + +static clib_error_t * +acl_set_aclplugin_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + clib_error_t *error = 0; + u32 timeout = 0; + u32 val = 0; + u32 eh_val = 0; + uword memory_size = 0; + acl_main_t *am = &acl_main; + + if (unformat (input, "skip-ipv6-extension-header %u %u", &eh_val, &val)) { + if(!acl_set_skip_ipv6_eh(eh_val, val)) { + error = clib_error_return(0, "expecting eh=0..255, value=0..1"); + } + goto done; + } + if (unformat (input, "use-hash-acl-matching %u", &val)) + { + am->use_hash_acl_matching = (val !=0); + goto done; + } + if (unformat (input, "l4-match-nonfirst-fragment %u", &val)) + { + am->l4_match_nonfirst_fragment = (val != 0); + goto done; + } + if (unformat (input, "heap")) + { + if (unformat(input, "main")) + { + if (unformat(input, "validate %u", &val)) + acl_plugin_acl_set_validate_heap(am, val); + else if (unformat(input, "trace %u", &val)) + acl_plugin_acl_set_trace_heap(am, val); + goto done; + } + else if (unformat(input, "hash")) + { + if (unformat(input, "validate %u", &val)) + acl_plugin_hash_acl_set_validate_heap(am, val); + else if (unformat(input, "trace %u", &val)) + acl_plugin_hash_acl_set_trace_heap(am, val); + goto done; + } + goto done; + } + if (unformat (input, "session")) { + if (unformat (input, "table")) { + /* The commands here are for tuning/testing. No user-serviceable parts inside */ + if (unformat (input, "max-entries")) { + if (!unformat(input, "%u", &val)) { + error = clib_error_return(0, + "expecting maximum number of entries, got `%U`", + format_unformat_error, input); + goto done; + } else { + acl_set_session_max_entries(val); + goto done; + } + } + if (unformat (input, "hash-table-buckets")) { + if (!unformat(input, "%u", &val)) { + error = clib_error_return(0, + "expecting maximum number of hash table buckets, got `%U`", + format_unformat_error, input); + goto done; + } else { + am->fa_conn_table_hash_num_buckets = val; + goto done; + } + } + if (unformat (input, "hash-table-memory")) { + if (!unformat(input, "%U", unformat_memory_size, &memory_size)) { + error = clib_error_return(0, + "expecting maximum amount of hash table memory, got `%U`", + format_unformat_error, input); + goto done; + } else { + am->fa_conn_table_hash_memory_size = memory_size; + goto done; + } + } + goto done; + } + if (unformat (input, "timeout")) { + if (unformat(input, "udp")) { + if(unformat(input, "idle")) { + if (!unformat(input, "%u", &timeout)) { + error = clib_error_return(0, + "expecting timeout value in seconds, got `%U`", + format_unformat_error, input); + goto done; + } else { + acl_set_timeout_sec(ACL_TIMEOUT_UDP_IDLE, timeout); + goto done; + } + } + } + if (unformat(input, "tcp")) { + if(unformat(input, "idle")) { + if (!unformat(input, "%u", &timeout)) { + error = clib_error_return(0, + "expecting timeout value in seconds, got `%U`", + format_unformat_error, input); + goto done; + } else { + acl_set_timeout_sec(ACL_TIMEOUT_TCP_IDLE, timeout); + goto done; + } + } + if(unformat(input, "transient")) { + if (!unformat(input, "%u", &timeout)) { + error = clib_error_return(0, + "expecting timeout value in seconds, got `%U`", + format_unformat_error, input); + goto done; + } else { + acl_set_timeout_sec(ACL_TIMEOUT_TCP_TRANSIENT, timeout); + goto done; + } + } + } + 
goto done;
+      }
+    }
+done:
+  return error;
+}
+
+static u8 *
+my_format_mac_address (u8 * s, va_list * args)
+{
+  u8 *a = va_arg (*args, u8 *);
+  return format (s, "%02x:%02x:%02x:%02x:%02x:%02x",
+                 a[0], a[1], a[2], a[3], a[4], a[5]);
+}
+
+static inline u8 *
+my_macip_acl_rule_t_pretty_format (u8 *out, va_list *args)
+{
+  macip_acl_rule_t *a = va_arg (*args, macip_acl_rule_t *);
+
+  out = format (out, "%s action %d ip %U/%d mac %U mask %U",
+                a->is_ipv6 ? "ipv6" : "ipv4", a->is_permit,
+                format_ip46_address, &a->src_ip_addr, IP46_TYPE_ANY,
+                a->src_prefixlen,
+                my_format_mac_address, a->src_mac,
+                my_format_mac_address, a->src_mac_mask);
+  return (out);
+}
+
+static void
+macip_acl_print (acl_main_t *am, u32 macip_acl_index)
+{
+  vlib_main_t *vm = am->vlib_main;
+  int i;
+
+  /* Don't try to print someone else's memory.
+     Note: index == vec_len is already out of range, hence ">=". */
+  if (macip_acl_index >= vec_len (am->macip_acls))
+    return;
+
+  macip_acl_list_t *a = vec_elt_at_index (am->macip_acls, macip_acl_index);
+  int free_pool_slot = pool_is_free_index (am->macip_acls, macip_acl_index);
+
+  vlib_cli_output (vm, "MACIP acl_index: %d, count: %d (true len %d) tag {%s} is free pool slot: %d\n",
+                   macip_acl_index, a->count, vec_len (a->rules), a->tag, free_pool_slot);
+  vlib_cli_output (vm, " ip4_table_index %d, ip6_table_index %d, l2_table_index %d\n",
+                   a->ip4_table_index, a->ip6_table_index, a->l2_table_index);
+  for (i = 0; i < vec_len (a->rules); i++)
+    vlib_cli_output (vm, " rule %d: %U\n", i, my_macip_acl_rule_t_pretty_format,
+                     vec_elt_at_index (a->rules, i));
+}
+
+static clib_error_t *
+acl_show_aclplugin_macip_fn (vlib_main_t * vm,
+                             unformat_input_t * input,
+                             vlib_cli_command_t * cmd)
+{
+  clib_error_t *error = 0;
+  acl_main_t *am = &acl_main;
+  int i;
+  if (unformat (input, "interface"))
+    {
+      for (i = 0; i < vec_len (am->macip_acl_by_sw_if_index); i++)
+        {
+          vlib_cli_output (vm, " sw_if_index %d: %d\n", i,
+                           vec_elt (am->macip_acl_by_sw_if_index, i));
+        }
+    }
+  else if (unformat (input, "acl"))
+    {
+      for (i = 0; i < vec_len (am->macip_acls); i++)
+        macip_acl_print (am, i);
+    }
+  return error;
+}
+
+
+static clib_error_t *
+acl_show_aclplugin_fn (vlib_main_t * vm,
+                       unformat_input_t * input,
+                       vlib_cli_command_t * cmd)
+{
+  clib_error_t *error = 0;
+  acl_main_t *am = &acl_main;
+  vnet_interface_main_t *im = &am->vnet_main->interface_main;
+  u32 *pj;
+
+  vnet_sw_interface_t *swif;
+
+  if (unformat (input, "sessions"))
+    {
+      u8 *out0 = format (0, "");
+      u16 wk;
+      u32 show_bihash_verbose = 0;
+      u32 show_session_thread_id = ~0;
+      u32 show_session_session_index = ~0;
+      unformat (input, "thread %u index %u", &show_session_thread_id,
+                &show_session_session_index);
+      unformat (input, "verbose %u", &show_bihash_verbose);
+      {
+        u64 n_adds = am->fa_session_total_adds;
+        u64 n_dels = am->fa_session_total_dels;
+        out0 = format (out0, "Sessions total: add %lu - del %lu = %lu\n",
+                       n_adds, n_dels, n_adds - n_dels);
+      }
+      out0 = format (out0, "\n\nPer-thread data:\n");
+      for (wk = 0; wk < vec_len (am->per_worker_data); wk++)
+        {
+          acl_fa_per_worker_data_t *pw = &am->per_worker_data[wk];
+          out0 = format (out0, "Thread #%d:\n", wk);
+          if (show_session_thread_id == wk
+              && show_session_session_index < pool_len (pw->fa_sessions_pool))
+            {
+              out0 = format (out0, " session index %u:\n",
+                             show_session_session_index);
+              fa_session_t *sess =
+                pw->fa_sessions_pool + show_session_session_index;
+              u64 *m = (u64 *) &sess->info;
+              out0 = format (out0, " info: %016llx %016llx %016llx %016llx %016llx %016llx\n",
+                             m[0], m[1], m[2], m[3], m[4], m[5]);
+              out0 = format (out0, " sw_if_index: %u\n", 
sess->sw_if_index); + out0 = format(out0, " tcp_flags_seen: %x\n", sess->tcp_flags_seen.as_u16); + out0 = format(out0, " last active time: %lu\n", sess->last_active_time); + out0 = format(out0, " thread index: %u\n", sess->thread_index); + out0 = format(out0, " link enqueue time: %lu\n", sess->link_enqueue_time); + out0 = format(out0, " link next index: %u\n", sess->link_next_idx); + out0 = format(out0, " link prev index: %u\n", sess->link_prev_idx); + out0 = format(out0, " link list id: %u\n", sess->link_list_id); + } + out0 = format(out0, " connection add/del stats:\n", wk); + pool_foreach (swif, im->sw_interfaces, + ({ + u32 sw_if_index = swif->sw_if_index; + u64 n_adds = sw_if_index < vec_len(pw->fa_session_adds_by_sw_if_index) ? pw->fa_session_adds_by_sw_if_index[sw_if_index] : 0; + u64 n_dels = sw_if_index < vec_len(pw->fa_session_dels_by_sw_if_index) ? pw->fa_session_dels_by_sw_if_index[sw_if_index] : 0; + out0 = format(out0, " sw_if_index %d: add %lu - del %lu = %lu\n", sw_if_index, n_adds, n_dels, n_adds - n_dels); + })); + + out0 = format(out0, " connection timeout type lists:\n", wk); + u8 tt = 0; + for(tt = 0; tt < ACL_N_TIMEOUTS; tt++) { + u32 head_session_index = pw->fa_conn_list_head[tt]; + out0 = format(out0, " fa_conn_list_head[%d]: %d\n", tt, head_session_index); + if (~0 != head_session_index) { + fa_session_t *sess = pw->fa_sessions_pool + head_session_index; + out0 = format(out0, " last active time: %lu\n", sess->last_active_time); + out0 = format(out0, " link enqueue time: %lu\n", sess->link_enqueue_time); + } + } + + out0 = format(out0, " Next expiry time: %lu\n", pw->next_expiry_time); + out0 = format(out0, " Requeue until time: %lu\n", pw->requeue_until_time); + out0 = format(out0, " Current time wait interval: %lu\n", pw->current_time_wait_interval); + out0 = format(out0, " Count of deleted sessions: %lu\n", pw->cnt_deleted_sessions); + out0 = format(out0, " Delete already deleted: %lu\n", pw->cnt_already_deleted_sessions); + out0 = format(out0, " Session timers restarted: %lu\n", pw->cnt_session_timer_restarted); + out0 = format(out0, " Swipe until this time: %lu\n", pw->swipe_end_time); + out0 = format(out0, " sw_if_index serviced bitmap: %U\n", format_bitmap_hex, pw->serviced_sw_if_index_bitmap); + out0 = format(out0, " pending clear intfc bitmap : %U\n", format_bitmap_hex, pw->pending_clear_sw_if_index_bitmap); + out0 = format(out0, " clear in progress: %u\n", pw->clear_in_process); + out0 = format(out0, " interrupt is pending: %d\n", pw->interrupt_is_pending); + out0 = format(out0, " interrupt is needed: %d\n", pw->interrupt_is_needed); + out0 = format(out0, " interrupt is unwanted: %d\n", pw->interrupt_is_unwanted); + out0 = format(out0, " interrupt generation: %d\n", pw->interrupt_generation); + } + out0 = format(out0, "\n\nConn cleaner thread counters:\n"); +#define _(cnt, desc) out0 = format(out0, " %20lu: %s\n", am->cnt, desc); + foreach_fa_cleaner_counter; +#undef _ + vec_terminate_c_string(out0); + vlib_cli_output(vm, "\n\n%s\n\n", out0); + vlib_cli_output(vm, "Interrupt generation: %d\n", am->fa_interrupt_generation); + vlib_cli_output(vm, "Sessions per interval: min %lu max %lu increment: %f ms current: %f ms", + am->fa_min_deleted_sessions_per_interval, am->fa_max_deleted_sessions_per_interval, + am->fa_cleaner_wait_time_increment * 1000.0, ((f64)am->fa_current_cleaner_timer_wait_interval) * 1000.0/(f64)vm->clib_time.clocks_per_second); + + vec_free(out0); + show_fa_sessions_hash(vm, show_bihash_verbose); + } + else if (unformat (input, 
"interface")) + { + u32 sw_if_index = ~0; + u32 swi; + u8 * out0 = format(0, ""); + unformat (input, "sw_if_index %u", &sw_if_index); + for(swi = 0; (swi < vec_len(am->input_acl_vec_by_sw_if_index)) || + (swi < vec_len(am->output_acl_vec_by_sw_if_index)); swi++) { + out0 = format(out0, "sw_if_index %d:\n", swi); + + if ((swi < vec_len(am->input_acl_vec_by_sw_if_index)) && + (vec_len(am->input_acl_vec_by_sw_if_index[swi]) > 0)) { + out0 = format(out0, " input acl(s): "); + vec_foreach(pj, am->input_acl_vec_by_sw_if_index[swi]) { + out0 = format(out0, "%d ", *pj); + } + out0 = format(out0, "\n"); + } + + if ((swi < vec_len(am->output_acl_vec_by_sw_if_index)) && + (vec_len(am->output_acl_vec_by_sw_if_index[swi]) > 0)) { + out0 = format(out0, " output acl(s): "); + vec_foreach(pj, am->output_acl_vec_by_sw_if_index[swi]) { + out0 = format(out0, "%d ", *pj); + } + out0 = format(out0, "\n"); + } + + } + vec_terminate_c_string(out0); + vlib_cli_output(vm, "\n%s\n", out0); + vec_free(out0); + } + else if (unformat (input, "acl")) + { + u32 acl_index = ~0; + u32 i; + u8 * out0 = format(0, ""); + unformat (input, "index %u", &acl_index); + for(i=0; i<vec_len(am->acls); i++) { + if (acl_is_not_defined(am, i)) { + /* don't attempt to show the ACLs that do not exist */ + continue; + } + if ((acl_index != ~0) && (acl_index != i)) { + continue; + } + out0 = format(out0, "acl-index %u count %u tag {%s}\n", i, am->acls[i].count, am->acls[i].tag); + acl_rule_t *r; + int j; + for(j=0; j<am->acls[i].count; j++) { + r = &am->acls[i].rules[j]; + out0 = format(out0, " %4d: %s ", j, r->is_ipv6 ? "ipv6" : "ipv4"); + out0 = format_acl_action(out0, r->is_permit); + out0 = format(out0, " src %U/%d", format_ip46_address, &r->src, IP46_TYPE_ANY, r->src_prefixlen); + out0 = format(out0, " dst %U/%d", format_ip46_address, &r->dst, IP46_TYPE_ANY, r->dst_prefixlen); + out0 = format(out0, " proto %d", r->proto); + out0 = format(out0, " sport %d", r->src_port_or_type_first); + if (r->src_port_or_type_first != r->src_port_or_type_last) { + out0 = format(out0, "-%d", r->src_port_or_type_last); + } + out0 = format(out0, " dport %d", r->dst_port_or_code_first); + if (r->dst_port_or_code_first != r->dst_port_or_code_last) { + out0 = format(out0, "-%d", r->dst_port_or_code_last); + } + if (r->tcp_flags_mask || r->tcp_flags_value) { + out0 = format(out0, " tcpflags %d mask %d", r->tcp_flags_value, r->tcp_flags_mask); + } + out0 = format(out0, "\n"); + } + + if (i<vec_len(am->input_sw_if_index_vec_by_acl)) { + out0 = format(out0, " applied inbound on sw_if_index: "); + vec_foreach(pj, am->input_sw_if_index_vec_by_acl[i]) { + out0 = format(out0, "%d ", *pj); + } + out0 = format(out0, "\n"); + } + if (i<vec_len(am->output_sw_if_index_vec_by_acl)) { + out0 = format(out0, " applied outbound on sw_if_index: "); + vec_foreach(pj, am->output_sw_if_index_vec_by_acl[i]) { + out0 = format(out0, "%d ", *pj); + } + out0 = format(out0, "\n"); + } + } + vec_terminate_c_string(out0); + vlib_cli_output(vm, "\n%s\n", out0); + vec_free(out0); + } + else if (unformat (input, "memory")) + { + vlib_cli_output (vm, "ACL plugin main heap statistics:\n"); + if (am->acl_mheap) { + vlib_cli_output (vm, " %U\n", format_mheap, am->acl_mheap, 1); + } else { + vlib_cli_output (vm, " Not initialized\n"); + } + vlib_cli_output (vm, "ACL hash lookup support heap statistics:\n"); + if (am->hash_lookup_mheap) { + vlib_cli_output (vm, " %U\n", format_mheap, am->hash_lookup_mheap, 1); + } else { + vlib_cli_output (vm, " Not initialized\n"); + } + } + else if (unformat 
(input, "tables")) + { + ace_mask_type_entry_t *mte; + u32 acl_index = ~0; + u32 sw_if_index = ~0; + int show_acl_hash_info = 0; + int show_applied_info = 0; + int show_mask_type = 0; + int show_bihash = 0; + u32 show_bihash_verbose = 0; + + if (unformat (input, "acl")) { + show_acl_hash_info = 1; + /* mask-type is handy to see as well right there */ + show_mask_type = 1; + unformat (input, "index %u", &acl_index); + } else if (unformat (input, "applied")) { + show_applied_info = 1; + unformat (input, "sw_if_index %u", &sw_if_index); + } else if (unformat (input, "mask")) { + show_mask_type = 1; + } else if (unformat (input, "hash")) { + show_bihash = 1; + unformat (input, "verbose %u", &show_bihash_verbose); + } + + if ( ! (show_mask_type || show_acl_hash_info || show_applied_info || show_bihash) ) { + /* if no qualifiers specified, show all */ + show_mask_type = 1; + show_acl_hash_info = 1; + show_applied_info = 1; + show_bihash = 1; + } + + if (show_mask_type) { + vlib_cli_output(vm, "Mask-type entries:"); + /* *INDENT-OFF* */ + pool_foreach(mte, am->ace_mask_type_pool, + ({ + vlib_cli_output(vm, " %3d: %016llx %016llx %016llx %016llx %016llx %016llx refcount %d", + mte - am->ace_mask_type_pool, + mte->mask.kv.key[0], mte->mask.kv.key[1], mte->mask.kv.key[2], + mte->mask.kv.key[3], mte->mask.kv.key[4], mte->mask.kv.value, mte->refcount); + })); + /* *INDENT-ON* */ + } + + if (show_acl_hash_info) { + u32 i,j; + u8 * out0 = format(0, ""); + u64 *m; + out0 = format(out0, "Mask-ready ACL representations\n"); + for (i=0; i< vec_len(am->hash_acl_infos); i++) { + if ((acl_index != ~0) && (acl_index != i)) { + continue; + } + hash_acl_info_t *ha = &am->hash_acl_infos[i]; + out0 = format(out0, "acl-index %u bitmask-ready layout\n", i); + out0 = format(out0, " applied inbound on sw_if_index list: %U\n", format_vec32, ha->inbound_sw_if_index_list, "%d"); + out0 = format(out0, " applied outbound on sw_if_index list: %U\n", format_vec32, ha->outbound_sw_if_index_list, "%d"); + out0 = format(out0, " mask type index bitmap: %U\n", format_bitmap_hex, ha->mask_type_index_bitmap); + for(j=0; j<vec_len(ha->rules); j++) { + hash_ace_info_t *pa = &ha->rules[j]; + m = (u64 *)&pa->match; + out0 = format(out0, " %4d: %016llx %016llx %016llx %016llx %016llx %016llx mask index %d acl %d rule %d action %d src/dst portrange not ^2: %d,%d\n", + j, m[0], m[1], m[2], m[3], m[4], m[5], pa->mask_type_index, + pa->acl_index, pa->ace_index, pa->action, + pa->src_portrange_not_powerof2, pa->dst_portrange_not_powerof2); + } + } + vec_terminate_c_string(out0); + vlib_cli_output(vm, "\n%s\n", out0); + vec_free(out0); + } + + if (show_applied_info) { + u32 swi, j; + u8 * out0 = format(0, ""); + out0 = format(out0, "Applied lookup entries for interfaces\n"); + + for(swi = 0; (swi < vec_len(am->input_applied_hash_acl_info_by_sw_if_index)) || + (swi < vec_len(am->output_applied_hash_acl_info_by_sw_if_index)) || + (swi < vec_len(am->input_hash_entry_vec_by_sw_if_index)) || + (swi < vec_len(am->output_hash_entry_vec_by_sw_if_index)); swi++) { + if ((sw_if_index != ~0) && (sw_if_index != swi)) { + continue; + } + out0 = format(out0, "sw_if_index %d:\n", swi); + if (swi < vec_len(am->input_applied_hash_acl_info_by_sw_if_index)) { + applied_hash_acl_info_t *pal = &am->input_applied_hash_acl_info_by_sw_if_index[swi]; + out0 = format(out0, " input lookup mask_type_index_bitmap: %U\n", format_bitmap_hex, pal->mask_type_index_bitmap); + out0 = format(out0, " input applied acls: %U\n", format_vec32, pal->applied_acls, "%d"); + } + if (swi 
< vec_len(am->input_hash_entry_vec_by_sw_if_index)) { + out0 = format(out0, " input lookup applied entries:\n"); + for(j=0; j<vec_len(am->input_hash_entry_vec_by_sw_if_index[swi]); j++) { + applied_hash_ace_entry_t *pae = &am->input_hash_entry_vec_by_sw_if_index[swi][j]; + out0 = format(out0, " %4d: acl %d rule %d action %d bitmask-ready rule %d next %d prev %d tail %d hitcount %lld\n", + j, pae->acl_index, pae->ace_index, pae->action, pae->hash_ace_info_index, + pae->next_applied_entry_index, pae->prev_applied_entry_index, pae->tail_applied_entry_index, pae->hitcount); + } + } + + if (swi < vec_len(am->output_applied_hash_acl_info_by_sw_if_index)) { + applied_hash_acl_info_t *pal = &am->output_applied_hash_acl_info_by_sw_if_index[swi]; + out0 = format(out0, " output lookup mask_type_index_bitmap: %U\n", format_bitmap_hex, pal->mask_type_index_bitmap); + out0 = format(out0, " output applied acls: %U\n", format_vec32, pal->applied_acls, "%d"); + } + if (swi < vec_len(am->output_hash_entry_vec_by_sw_if_index)) { + out0 = format(out0, " output lookup applied entries:\n"); + for(j=0; j<vec_len(am->output_hash_entry_vec_by_sw_if_index[swi]); j++) { + applied_hash_ace_entry_t *pae = &am->output_hash_entry_vec_by_sw_if_index[swi][j]; + out0 = format(out0, " %4d: acl %d rule %d action %d bitmask-ready rule %d next %d prev %d tail %d hitcount %lld\n", + j, pae->acl_index, pae->ace_index, pae->action, pae->hash_ace_info_index, + pae->next_applied_entry_index, pae->prev_applied_entry_index, pae->tail_applied_entry_index, pae->hitcount); + } + } + + } + vec_terminate_c_string(out0); + vlib_cli_output(vm, "\n%s\n", out0); + vec_free(out0); + } + + if (show_bihash) { + show_hash_acl_hash(vm, am, show_bihash_verbose); + } + } + return error; +} + +static clib_error_t * +acl_clear_aclplugin_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + clib_error_t *error = 0; + acl_main_t *am = &acl_main; + vlib_process_signal_event (am->vlib_main, am->fa_cleaner_node_index, + ACL_FA_CLEANER_DELETE_BY_SW_IF_INDEX, ~0); + return error; +} + + /* *INDENT-OFF* */ +VLIB_CLI_COMMAND (aclplugin_set_command, static) = { + .path = "set acl-plugin", + .short_help = "set acl-plugin session timeout {{udp idle}|tcp {idle|transient}} <seconds>", + .function = acl_set_aclplugin_fn, +}; + +VLIB_CLI_COMMAND (aclplugin_show_command, static) = { + .path = "show acl-plugin", + .short_help = "show acl-plugin {sessions|acl|interface|tables}", + .function = acl_show_aclplugin_fn, +}; + +VLIB_CLI_COMMAND (aclplugin_show_macip_command, static) = { + .path = "show acl-plugin macip", + .short_help = "show acl-plugin macip {acl|interface}", + .function = acl_show_aclplugin_macip_fn, +}; + + +VLIB_CLI_COMMAND (aclplugin_clear_command, static) = { + .path = "clear acl-plugin sessions", + .short_help = "clear acl-plugin sessions", + .function = acl_clear_aclplugin_fn, +}; +/* *INDENT-ON* */ + +static clib_error_t * +acl_plugin_config (vlib_main_t * vm, unformat_input_t * input) +{ + acl_main_t *am = &acl_main; + u32 conn_table_hash_buckets; + u32 conn_table_hash_memory_size; + u32 conn_table_max_entries; + u32 main_heap_size; + u32 hash_heap_size; + u32 hash_lookup_hash_buckets; + u32 hash_lookup_hash_memory; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "connection hash buckets %d", &conn_table_hash_buckets)) + am->fa_conn_table_hash_num_buckets = conn_table_hash_buckets; + else if (unformat (input, "connection hash memory %d", + &conn_table_hash_memory_size)) + 
am->fa_conn_table_hash_memory_size = conn_table_hash_memory_size; + else if (unformat (input, "connection count max %d", + &conn_table_max_entries)) + am->fa_conn_table_max_entries = conn_table_max_entries; + else if (unformat (input, "main heap size %d", + &main_heap_size)) + am->acl_mheap_size = main_heap_size; + else if (unformat (input, "hash lookup heap size %d", + &hash_heap_size)) + am->hash_lookup_mheap_size = hash_heap_size; + else if (unformat (input, "hash lookup hash buckets %d", + &hash_lookup_hash_buckets)) + am->hash_lookup_hash_buckets = hash_lookup_hash_buckets; + else if (unformat (input, "hash lookup hash memory %d", + &hash_lookup_hash_memory)) + am->hash_lookup_hash_memory = hash_lookup_hash_memory; + else + return clib_error_return (0, "unknown input '%U'", + format_unformat_error, input); + } + return 0; +} +VLIB_CONFIG_FUNCTION (acl_plugin_config, "acl-plugin"); + +static clib_error_t * +acl_init (vlib_main_t * vm) +{ + acl_main_t *am = &acl_main; + clib_error_t *error = 0; + memset (am, 0, sizeof (*am)); + am->vlib_main = vm; + am->vnet_main = vnet_get_main (); + + u8 *name = format (0, "acl_%08x%c", api_version, 0); + + /* Ask for a correctly-sized block of API message decode slots */ + am->msg_id_base = vl_msg_api_get_msg_ids ((char *) name, + VL_MSG_FIRST_AVAILABLE); + + error = acl_plugin_api_hookup (vm); + + /* Add our API messages to the global name_crc hash table */ + setup_message_id_table (am, &api_main); + + vec_free (name); + + acl_setup_fa_nodes(); + + am->acl_mheap_size = ACL_FA_DEFAULT_HEAP_SIZE; + am->hash_lookup_mheap_size = ACL_PLUGIN_HASH_LOOKUP_HEAP_SIZE; + + am->hash_lookup_hash_buckets = ACL_PLUGIN_HASH_LOOKUP_HASH_BUCKETS; + am->hash_lookup_hash_memory = ACL_PLUGIN_HASH_LOOKUP_HASH_MEMORY; + + am->session_timeout_sec[ACL_TIMEOUT_TCP_TRANSIENT] = TCP_SESSION_TRANSIENT_TIMEOUT_SEC; + am->session_timeout_sec[ACL_TIMEOUT_TCP_IDLE] = TCP_SESSION_IDLE_TIMEOUT_SEC; + am->session_timeout_sec[ACL_TIMEOUT_UDP_IDLE] = UDP_SESSION_IDLE_TIMEOUT_SEC; + + am->fa_conn_table_hash_num_buckets = ACL_FA_CONN_TABLE_DEFAULT_HASH_NUM_BUCKETS; + am->fa_conn_table_hash_memory_size = ACL_FA_CONN_TABLE_DEFAULT_HASH_MEMORY_SIZE; + am->fa_conn_table_max_entries = ACL_FA_CONN_TABLE_DEFAULT_MAX_ENTRIES; + vlib_thread_main_t *tm = vlib_get_thread_main (); + vec_validate(am->per_worker_data, tm->n_vlib_mains-1); + { + u16 wk; + u8 tt; + for (wk = 0; wk < vec_len (am->per_worker_data); wk++) { + acl_fa_per_worker_data_t *pw = &am->per_worker_data[wk]; + vec_validate(pw->fa_conn_list_head, ACL_N_TIMEOUTS-1); + vec_validate(pw->fa_conn_list_tail, ACL_N_TIMEOUTS-1); + for(tt = 0; tt < ACL_N_TIMEOUTS; tt++) { + pw->fa_conn_list_head[tt] = ~0; + pw->fa_conn_list_tail[tt] = ~0; + } + } + } + + am->fa_min_deleted_sessions_per_interval = ACL_FA_DEFAULT_MIN_DELETED_SESSIONS_PER_INTERVAL; + am->fa_max_deleted_sessions_per_interval = ACL_FA_DEFAULT_MAX_DELETED_SESSIONS_PER_INTERVAL; + am->fa_cleaner_wait_time_increment = ACL_FA_DEFAULT_CLEANER_WAIT_TIME_INCREMENT; + + am->fa_cleaner_cnt_delete_by_sw_index = 0; + am->fa_cleaner_cnt_delete_by_sw_index_ok = 0; + am->fa_cleaner_cnt_unknown_event = 0; + am->fa_cleaner_cnt_timer_restarted = 0; + am->fa_cleaner_cnt_wait_with_timeout = 0; + + +#define _(N, v, s) am->fa_ipv6_known_eh_bitmap = clib_bitmap_set(am->fa_ipv6_known_eh_bitmap, v, 1); + foreach_acl_eh +#undef _ + + am->l4_match_nonfirst_fragment = 1; + + /* use the new fancy hash-based matching */ + am->use_hash_acl_matching = 1; + + return error; +} + +VLIB_INIT_FUNCTION (acl_init); 
diff --git a/src/plugins/acl/acl.h b/src/plugins/acl/acl.h new file mode 100644 index 00000000..bed22e5f --- /dev/null +++ b/src/plugins/acl/acl.h @@ -0,0 +1,316 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef included_acl_h +#define included_acl_h + +#include <vnet/vnet.h> +#include <vnet/ip/ip.h> +#include <vnet/l2/l2_output.h> + + +#include <vppinfra/hash.h> +#include <vppinfra/error.h> +#include <vppinfra/bitmap.h> +#include <vppinfra/elog.h> +#include <vppinfra/bihash_48_8.h> +#include <vppinfra/bihash_40_8.h> + +#include "fa_node.h" +#include "hash_lookup_types.h" + +#define ACL_PLUGIN_VERSION_MAJOR 1 +#define ACL_PLUGIN_VERSION_MINOR 3 + +#define UDP_SESSION_IDLE_TIMEOUT_SEC 600 +#define TCP_SESSION_IDLE_TIMEOUT_SEC (3600*24) +#define TCP_SESSION_TRANSIENT_TIMEOUT_SEC 120 + +#define ACL_FA_DEFAULT_HEAP_SIZE (2 << 29) + +#define ACL_PLUGIN_HASH_LOOKUP_HEAP_SIZE (2 << 25) +#define ACL_PLUGIN_HASH_LOOKUP_HASH_BUCKETS 65536 +#define ACL_PLUGIN_HASH_LOOKUP_HASH_MEMORY (2 << 25) + +extern vlib_node_registration_t acl_in_node; +extern vlib_node_registration_t acl_out_node; + +void input_acl_packet_match(u32 sw_if_index, vlib_buffer_t * b0, u32 *nextp, u32 *acl_match_p, u32 *rule_match_p, u32 *trace_bitmap); +void output_acl_packet_match(u32 sw_if_index, vlib_buffer_t * b0, u32 *nextp, u32 *acl_match_p, u32 *rule_match_p, u32 *trace_bitmap); + +enum acl_timeout_e { + ACL_TIMEOUT_UDP_IDLE = 0, + ACL_TIMEOUT_TCP_IDLE, + ACL_TIMEOUT_TCP_TRANSIENT, + ACL_N_TIMEOUTS +}; + + +enum address_e { IP4, IP6 }; +typedef struct +{ + enum address_e type; + union { + ip6_address_t ip6; + ip4_address_t ip4; + } addr; +} address_t; + +/* + * ACL rules + */ +typedef struct +{ + u8 is_permit; + u8 is_ipv6; + ip46_address_t src; + u8 src_prefixlen; + ip46_address_t dst; + u8 dst_prefixlen; + u8 proto; + u16 src_port_or_type_first; + u16 src_port_or_type_last; + u16 dst_port_or_code_first; + u16 dst_port_or_code_last; + u8 tcp_flags_value; + u8 tcp_flags_mask; +} acl_rule_t; + +typedef struct +{ + u8 is_permit; + u8 is_ipv6; + u8 src_mac[6]; + u8 src_mac_mask[6]; + ip46_address_t src_ip_addr; + u8 src_prefixlen; +} macip_acl_rule_t; + +/* + * ACL + */ +typedef struct +{ + u8 tag[64]; + u32 count; + acl_rule_t *rules; +} acl_list_t; + +typedef struct +{ + u8 tag[64]; + u32 count; + macip_acl_rule_t *rules; + /* References to the classifier tables that will enforce the rules */ + u32 ip4_table_index; + u32 ip6_table_index; + u32 l2_table_index; +} macip_acl_list_t; + +/* + * An element describing a particular configuration fo the mask, + * and how many times it has been used. 
+ */
+typedef struct
+{
+  fa_5tuple_t mask;
+  u32 refcount;
+} ace_mask_type_entry_t;
+
+typedef struct {
+  /* mheap to hold all the ACL module related allocations, other than hash */
+  void *acl_mheap;
+  u32 acl_mheap_size;
+
+  /* API message ID base */
+  u16 msg_id_base;
+
+  acl_list_t *acls;	/* Pool of ACLs */
+  hash_acl_info_t *hash_acl_infos; /* corresponding hash matching housekeeping info */
+  clib_bihash_48_8_t acl_lookup_hash; /* ACL lookup hash table. */
+  u32 hash_lookup_hash_buckets;
+  u32 hash_lookup_hash_memory;
+
+  /* mheap to hold all the miscellaneous allocations related to hash-based lookups */
+  void *hash_lookup_mheap;
+  u32 hash_lookup_mheap_size;
+  int acl_lookup_hash_initialized;
+  applied_hash_ace_entry_t **input_hash_entry_vec_by_sw_if_index;
+  applied_hash_ace_entry_t **output_hash_entry_vec_by_sw_if_index;
+  applied_hash_acl_info_t *input_applied_hash_acl_info_by_sw_if_index;
+  applied_hash_acl_info_t *output_applied_hash_acl_info_by_sw_if_index;
+
+  macip_acl_list_t *macip_acls;	/* Pool of MAC-IP ACLs */
+
+  /* ACLs associated with interfaces */
+  u32 **input_acl_vec_by_sw_if_index;
+  u32 **output_acl_vec_by_sw_if_index;
+
+  /* interfaces on which given ACLs are applied */
+  u32 **input_sw_if_index_vec_by_acl;
+  u32 **output_sw_if_index_vec_by_acl;
+
+  /* Total count of interface+direction pairs enabled */
+  u32 fa_total_enabled_count;
+
+  /* Do we use hash-based ACL matching or linear */
+  int use_hash_acl_matching;
+
+  /* a pool of all mask types present in all ACEs */
+  ace_mask_type_entry_t *ace_mask_type_pool;
+
+  /*
+   * Classify tables used to grab the packets for the ACL check,
+   * and serving as the 5-tuple session tables at the same time
+   */
+  u32 *acl_ip4_input_classify_table_by_sw_if_index;
+  u32 *acl_ip6_input_classify_table_by_sw_if_index;
+  u32 *acl_ip4_output_classify_table_by_sw_if_index;
+  u32 *acl_ip6_output_classify_table_by_sw_if_index;
+
+  /* MACIP (input) ACLs associated with the interfaces */
+  u32 *macip_acl_by_sw_if_index;
+
+  /* bitmaps: when set, the processing is enabled on the interface */
+  uword *fa_in_acl_on_sw_if_index;
+  uword *fa_out_acl_on_sw_if_index;
+  /* bihash holding all of the sessions */
+  int fa_sessions_hash_is_initialized;
+  clib_bihash_40_8_t fa_sessions_hash;
+  /* The process node which orchestrates the cleanup */
+  u32 fa_cleaner_node_index;
+  /* FA session timeouts, in seconds */
+  u32 session_timeout_sec[ACL_N_TIMEOUTS];
+  /* total session adds/dels */
+  u64 fa_session_total_adds;
+  u64 fa_session_total_dels;
+
+  /* L2 datapath glue */
+
+  /* next indices within L2 classifiers for ip4/ip6 fa L2 nodes */
+  u32 l2_input_classify_next_acl_ip4;
+  u32 l2_input_classify_next_acl_ip6;
+  u32 l2_output_classify_next_acl_ip4;
+  u32 l2_output_classify_next_acl_ip6;
+  /* next node indices for L2 dispatch */
+  u32 fa_acl_in_ip4_l2_node_feat_next_node_index[32];
+  u32 fa_acl_in_ip6_l2_node_feat_next_node_index[32];
+  u32 fa_acl_out_ip4_l2_node_feat_next_node_index[32];
+  u32 fa_acl_out_ip6_l2_node_feat_next_node_index[32];
+
+  /* EH values that we can skip over */
+  uword *fa_ipv6_known_eh_bitmap;
+
+  /* whether to match L4 ACEs with ports on the non-initial fragment */
+  int l4_match_nonfirst_fragment;
+
+  /* per-interface conn table parameters */
+  u32 fa_conn_table_hash_num_buckets;
+  uword fa_conn_table_hash_memory_size;
+  u64 fa_conn_table_max_entries;
+
+  /*
+   * If the cleaner has to delete more than this number
+   * of connections, it halves the sleep time.
+ */
+
+#define ACL_FA_DEFAULT_MAX_DELETED_SESSIONS_PER_INTERVAL 100
+  u64 fa_max_deleted_sessions_per_interval;
+
+  /*
+   * If the cleaner deletes fewer connections than this,
+   * it increases the wait time by the "increment"
+   */
+
+#define ACL_FA_DEFAULT_MIN_DELETED_SESSIONS_PER_INTERVAL 1
+  u64 fa_min_deleted_sessions_per_interval;
+
+#define ACL_FA_DEFAULT_CLEANER_WAIT_TIME_INCREMENT 0.1
+  f64 fa_cleaner_wait_time_increment;
+
+  u64 fa_current_cleaner_timer_wait_interval;
+
+  int fa_interrupt_generation;
+
+  /* per-worker data related to conn management */
+  acl_fa_per_worker_data_t *per_worker_data;
+
+  /* Configured session timeout */
+  u64 session_timeout[ACL_N_TIMEOUTS];
+
+
+  /* Counters for the cleaner thread */
+
+#define foreach_fa_cleaner_counter \
+  _(fa_cleaner_cnt_delete_by_sw_index, "delete_by_sw_index events") \
+  _(fa_cleaner_cnt_delete_by_sw_index_ok, "delete_by_sw_index handled ok") \
+  _(fa_cleaner_cnt_unknown_event, "unknown events received") \
+  _(fa_cleaner_cnt_timer_restarted, "session idle timers restarted") \
+  _(fa_cleaner_cnt_wait_with_timeout, "event wait with timeout called") \
+  _(fa_cleaner_cnt_wait_without_timeout, "event wait w/o timeout called") \
+  _(fa_cleaner_cnt_event_cycles, "total event cycles") \
+/* end of counters */
+#define _(id, desc) u32 id;
+  foreach_fa_cleaner_counter
+#undef _
+
+  /* convenience */
+  vlib_main_t * vlib_main;
+  vnet_main_t * vnet_main;
+} acl_main_t;
+
+#define foreach_acl_eh \
+   _(HOPBYHOP , 0  , "IPv6ExtHdrHopByHop") \
+   _(ROUTING  , 43 , "IPv6ExtHdrRouting") \
+   _(DESTOPT  , 60 , "IPv6ExtHdrDestOpt") \
+   _(FRAGMENT , 44 , "IPv6ExtHdrFragment") \
+   _(MOBILITY , 135, "Mobility Header") \
+   _(HIP      , 139, "Experimental use Host Identity Protocol") \
+   _(SHIM6    , 140, "Shim6 Protocol") \
+   _(EXP1     , 253, "Use for experimentation and testing") \
+   _(EXP2     , 254, "Use for experimentation and testing")
+
+/*
+
+ "No Next Header" is not a header.
+ Also, the Fragment header needs special processing.
+
+   _(NONEXT   , 59 , "NoNextHdr") \
+
+
+ESP is hiding its internal format, so there is no point in trying to go past it.
+
+   _(ESP      , 50 , "EncapsulatingSecurityPayload") \
+
+
+AH has a special treatment of its length: it is in 32-bit words, not 64-bit words like the rest.
+
+   _(AUTH     , 51 , "Authentication Header") \
+
+
+*/
+
+typedef enum {
+#define _(N, v, s) ACL_EH_##N = v,
+  foreach_acl_eh
+#undef _
+} acl_eh_t;
+
+
+
+extern acl_main_t acl_main;
+
+
+#endif
diff --git a/src/plugins/acl/acl_all_api_h.h b/src/plugins/acl/acl_all_api_h.h
new file mode 100644
index 00000000..cb781cfd
--- /dev/null
+++ b/src/plugins/acl/acl_all_api_h.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/* Include the generated file, see BUILT_SOURCES in Makefile.am */
+#include <acl/acl.api.h>
+
+#ifdef vl_printfun
+#include <acl/manual_fns.h>
+#endif
+
diff --git a/src/plugins/acl/acl_hash_lookup_doc.md b/src/plugins/acl/acl_hash_lookup_doc.md
new file mode 100644
index 00000000..cb93df04
--- /dev/null
+++ b/src/plugins/acl/acl_hash_lookup_doc.md
@@ -0,0 +1,241 @@
+ACL plugin constant-time lookup design {#acl_hash_lookup}
+======================================
+
+The initial implementation of the ACL plugin performs a trivial for() cycle,
+going through the assigned ACLs on a per-packet basis. This is not very
+efficient, even though for very short ACLs, thanks to its simplicity, it can
+beat more advanced methods.
+
+However, to cover the case of longer ACLs with acceptable performance,
+we need a better way of matching. This write-up proposes
+a mechanism to take the lookup from O(M), where M is the number of entries,
+to O(N), where N is the number of different mask combinations.
+
+Preparation of ACL(s)
+---------------------
+
+The ACL plugin will maintain a global list of "mask types", i.e. the specific
+configurations of "do not care" bits within the ACEs.
+Upon the creation of a new ACL, a pass will be made through all the
+ACEs, to assign and possibly allocate the "mask type number".
+
+Each ACL has a structure *hash_acl_info_t* representing the "hash-based"
+parts of information related to that ACL, primarily the array of
+*hash_ace_info_t* structures - each member of that array
+corresponding to one of the rules (ACEs) in the original ACL;
+for this purpose they carry a pair of *(acl_index, ace_index)* to keep track,
+predominantly for debugging.
+
+Why do we need a whole separate structure, rather than adding new fields
+to the existing rule structure? First, encapsulation: to minimize
+the pollution of the main ACL code with the hash-based lookup artifacts.
+
+Second, one rule may correspond to more than one "hash-based" ACE.
+In fact, most of the rules do correspond to two of those. Why?
+
+Consider that the current ACL lookup logic is that if a packet
+is not the initial fragment, and there is an L4 entry acting on the packet,
+the comparison will be made only on the L4 protocol field value rather
+than on the protocol and port values. This behaviour is governed by
+the *l4_match_nonfirst_fragment* flag in *acl_main*, and was needed to
+maintain compatibility with the existing software switch implementation.
+
+While for the sequential check in *single_acl_match_5tuple()*
+it is very easy to implement by just breaking out at the right moment,
+in the case of hash-based matching this costs us two checks:
+one on the full 5-tuple with the flag *pkt.is_nonfirst_fragment* being zero,
+the second on the 3-tuple with the flag *pkt.is_nonfirst_fragment* being one,
+with the second check triggered by the *acl_main.l4_match_nonfirst_fragment*
+setting being the default 1. This dictates the necessity of having a "match"
+field in a given *hash_ace_info_t* element, which reflects the value
+we are supposed to match after applying the mask.
+
+There can be other circumstances when it might be beneficial to expand
+a given rule in the original ACL into multiple ones - for example, as an
+optimization within the port range handling for small port ranges
+(this is not done as of the time of writing).
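+
+As a rough illustration of the "mask type number" assignment, here is a
+sketch of a find-or-allocate helper. It assumes the *ace_mask_type_entry_t*
+pool from acl.h; the actual allocation logic lives in the hash-lookup code
+and may differ in its details:
+
+```
+/* Sketch only: find or allocate the mask type index for a given mask.
+   Assumes am->ace_mask_type_pool from acl.h; not the actual plugin code. */
+static u32
+find_or_add_mask_type (acl_main_t * am, fa_5tuple_t * mask)
+{
+  ace_mask_type_entry_t *mte;
+  u32 mti = ~0;
+
+  /* reuse an existing entry with a bit-identical mask, if present */
+  pool_foreach (mte, am->ace_mask_type_pool,
+  ({
+    if ((~0 == mti) && (0 == memcmp (&mte->mask, mask, sizeof (*mask))))
+      mti = mte - am->ace_mask_type_pool;
+  }));
+
+  if (~0 == mti)
+    {
+      pool_get (am->ace_mask_type_pool, mte);
+      mte->mask = *mask;
+      mte->refcount = 0;
+      mti = mte - am->ace_mask_type_pool;
+    }
+  /* each ACE using this mask type holds one reference */
+  pool_elt_at_index (am->ace_mask_type_pool, mti)->refcount++;
+  return mti;
+}
+```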
+
+Assigning ACLs to an interface
+------------------------------
+
+Once the ACL list is assigned to an interface, or, rather, a new ACL
+is added to the list of the existing ACLs applied to the interface,
+we need to update the bihash accelerating the lookup.
+
+All the entries for the lookups are stored within a single *48_8* bihash,
+which captures the 5-tuple from the packet as well as the miscellaneous
+per-packet information flags, e.g. *l4_valid*, *is_non_first_fragment*,
+and so on. To facilitate the use of the single bihash by all the interfaces,
+the *is_ip6*, *is_input*, and *sw_if_index* are part of the key,
+as well as *mask_type_index* - the latter being necessary because
+there can be entries with the same value but different masks, e.g.:
+`permit ::/0, permit ::/128`.
+
+At the moment an ACL is applied to an interface, we need to
+walk the list of *hash_ace_info_t* entries corresponding to that ACL,
+and update the bihash with the keys corresponding to the match
+values in these entries.
+
+The value of the hash match contains the index into a per-*sw_if_index* vector
+of *applied_hash_ace_entry_t* elements, as well as a couple of flags:
+*shadowed* (optimization: if this flag on a matched entry is zero, it means
+we can stop the lookup early and declare a match - see below),
+and *need_portrange_check* - meaning that what matched was a superset
+of the actual match, and we need to perform an extra check.
+
+Also, upon insertion, we must keep in mind that there can be
+multiple *applied_hash_ace_entry_t* entries for the same key, and we must
+keep a list of those. This is necessary to incrementally apply/unapply
+the ACLs as part of the ACL vector: say, two ACLs have
+"permit 2001:db8::1/128 any" - we should be able to retain the entry
+for the second ACL even if we have deleted the first one.
+Also, in case there are two entries with the same key but
+different port ranges, say 0..42 and 142..65535 - we need
+to be able to sequentially match on those if we decide not
+to expand them into individual port-specific entries.
+
+Per-packet lookup
+-----------------
+
+The simple single-packet lookup is defined in
+*multi_acl_match_get_applied_ace_index*, which returns the index
+of the applied hash ACE if there was a match, or ~0 if there wasn't.
+
+The future optimized per-packet lookup may be batched in three phases:
+
+1. Prepare the keys in the per-worker vector by doing a logical AND of the
+   original 5-tuple record with the elements of the mask vector.
+2. Look up the keys in the bihash in a batch manner, collecting the
+   result with the lowest u64 (acl index within vector, ACE index) from
+   the hash lookup value, and performing the list walk if necessary
+   (for port ranges).
+3. Take the action from the ACL record as defined by (ACL#, ACE#) from the
+   resulting lookup winner, or, if no match was found, perform the
+   default deny.
+
+Shadowed/independent/redundant ACEs
+------------------------------------
+
+During the phase of combining multiple ACLs into one rulebase, when they
+are applied to an interface, we can also perform several optimizations.
+
+If a given ACE is a strict subset of another ACE located earlier in the linear
+search order, we can ignore this ACE completely - because by definition
+it will never match. We will call such an ACE *redundant*.
+Here is an example:
+
+```
+permit 2001:db8:1::/48 2001:db8:2::/48 (B)
+deny 2001:db8:1:1::/64 2001:db8:2:1::/64 (A)
+```
+
+A bit more formally, we can define this relationship of an ACE A to ACE B as:
+
+```
+redundant(aceA, aceB) := (contains(protoB, protoA) && contains(srcB, srcA)
+                          && contains(dstB, dstA) && is_after(A, B))
+```
+
+Here by "contains" we define an operation operating on the sets defined by
+the protocol, (srcIP, srcPortDefinition) and (dstIP, dstPortDefinition)
+respectively, and returning true if all the elements represented by
+the second argument are represented by the first argument. The "is_after"
+is true if A is located below B in the ruleset.
+
+If a given ACE does not intersect at all with any other ACE
+in front of it, we can mark it as such.
+
+Then during the sequence of the lookups a successful hit on this ACE means
+we do not need to look up other mask combinations - thus potentially
+significantly speeding up the match process. Here is an example,
+assuming we have the following ACL:
+
+```
+permit 2001:db8:1::/48 2001:db8:2::/48 (B)
+deny 2001:db8:3::/48 2001:db8:2:1::/64 (A)
+```
+
+In this case if we match the second entry, we do not need to check whether
+we have matched the first one - the source addresses are completely
+different. We call such an ACE *independent* from another.
+
+We can define this as
+
+```
+independent(aceA, aceB) := (!intersect(protoA, protoB) ||
+                            !intersect(srcA, srcB) ||
+                            !intersect(dstA, dstB))
+```
+
+where intersect is defined as an operation returning true if there are
+elements belonging to the sets of both arguments.
+
+If the entry A is neither redundant nor independent from B, and is below
+B in the ruleset, we call such an entry *shadowed* by B; here is an example:
+
+```
+deny tcp 2001:db8:1::/48 2001:db8:2::/48 (B)
+permit 2001:db8:1:1::/64 2001:db8:2:1::/64 (A)
+```
+
+This means the earlier rule "carves out" a subset of A, thus leaving
+a "shadow". (Evidently, the action needs to be different for the shadow
+to have an effect, but for the sake of terminology we do not care.)
+
+The more formal definition:
+
+```
+shadowed(aceA, aceB) := !redundant(aceA, aceB) &&
+                        !independent(aceA, aceB) &&
+                        is_after(aceA, aceB)
+```
+
+Using this terminology, any ruleset can be represented as
+a DAG (Directed Acyclic Graph), with the bottom being the implicit
+"deny any", pointing to the set of rules shadowing it or the ones
+it is redundant for.
+
+These rules may in turn be shadowing each other. There are no cycles in
+this graph because of the natural order of the rules - a rule located
+closer to the end of the ruleset can never shadow or make redundant a rule
+higher up.
+
+The optimization this enables is to skip matching certain
+masks on a per-lookup basis - if a given rule has matched,
+the only adjustment that can still happen is a match with one of
+the shadowing rules.
+
+Also, another avenue for optimization can be starting the lookup process
+with the mask type that maximizes the chances of an independent ACE match,
+thus resulting in an ACE lookup being a single hash table hit.
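+
+To make the redundant/independent/shadowed definitions concrete, here is a
+small self-contained toy implementation, restricted to IPv4 address prefixes
+only (the protocol and port sets are omitted). It is purely illustrative,
+with hypothetical types (*pfx4_t*, *ace4_t*), and is not the plugin's code:
+
+```
+#include <stdint.h>
+
+/* hypothetical illustration types - not part of the plugin */
+typedef struct { uint32_t addr; uint8_t len; } pfx4_t;
+typedef struct { pfx4_t src, dst; } ace4_t;
+
+uint32_t
+pfx_mask (uint8_t len)
+{
+  return len ? ~((1u << (32 - len)) - 1) : 0;
+}
+
+/* contains(b, a): every address covered by a is also covered by b */
+int
+contains (pfx4_t b, pfx4_t a)
+{
+  return (b.len <= a.len) && (((a.addr ^ b.addr) & pfx_mask (b.len)) == 0);
+}
+
+/* two prefixes intersect iff one of them contains the other */
+int
+intersect (pfx4_t a, pfx4_t b)
+{
+  return contains (a, b) || contains (b, a);
+}
+
+/* A (located after B) is redundant if B fully covers it */
+int
+redundant (ace4_t a, ace4_t b)
+{
+  return contains (b.src, a.src) && contains (b.dst, a.dst);
+}
+
+/* A is independent from B if they can never match the same packet */
+int
+independent (ace4_t a, ace4_t b)
+{
+  return !intersect (a.src, b.src) || !intersect (a.dst, b.dst);
+}
+
+/* otherwise, A located after B is shadowed by B */
+int
+shadowed (ace4_t a, ace4_t b)
+{
+  return !redundant (a, b) && !independent (a, b);
+}
+```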
+
+Using this terminology, any ruleset can be represented as
+a DAG (Directed Acyclic Graph), with the bottom being the implicit
+"deny any", pointing to the set of rules shadowing it or the ones
+relative to which it is redundant.
+
+These rules may in turn be shadowing each other. There are no cycles in
+this graph because of the natural order of the rules - the rule located
+closer to the end of the ruleset can never shadow or make redundant a rule
+higher up.
+
+The optimization this enables is to skip matching certain
+masks on a per-lookup basis - if a given rule has matched,
+the only adjustment that can still happen is a match with one of
+the shadowing rules.
+
+Another avenue for optimization is to start the lookup process
+with the mask type that maximizes the chances of an independent ACE match,
+thus making the ACE lookup a single hash table hit.
+
+
+Plumbing
+--------
+
+All the new routines are located in a separate file,
+so we can cleanly experiment with a different approach if this
+does not fit all of the use cases.
+
+The constant-time lookup within the data path has an API with
+the following signature:
+
+```
+u8
+multi_acl_match_5tuple (u32 sw_if_index, fa_5tuple_t * pkt_5tuple, int is_l2,
+                       int is_ip6, int is_input, u32 * acl_match_p,
+                       u32 * rule_match_p, u32 * trace_bitmap)
+```
+
+There should be a new upper-level function with the same signature, which
+will decide whether to use the linear lookup, the constant-time lookup
+implemented by this work, or some other optimization
+(e.g. a cache of the last N lookups).
+
+The routine doing the preparatory work should be called
+in `acl_add_list()` after creating the linear-lookup structures,
+and the routine populating the hashtable
+should be called from `acl_interface_add_del_inout_acl()` or its callees.
+
+The initial implementation will be geared towards looking up a single
+match at a time, with subsequent optimizations possibly batching the
+lookup across more than one packet.
+
diff --git a/src/plugins/acl/acl_msg_enum.h b/src/plugins/acl/acl_msg_enum.h
new file mode 100644
index 00000000..14d8b48c
--- /dev/null
+++ b/src/plugins/acl/acl_msg_enum.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef included_acl_msg_enum_h
+#define included_acl_msg_enum_h
+
+#include <vppinfra/byte_order.h>
+
+#define vl_msg_id(n,h) n,
+typedef enum {
+#include <acl/acl_all_api_h.h>
+  /* We'll want to know how many messages IDs we need... */
+  VL_MSG_FIRST_AVAILABLE,
+} vl_msg_id_t;
+#undef vl_msg_id
+
+#endif
diff --git a/src/plugins/acl/acl_multicore_doc.md b/src/plugins/acl/acl_multicore_doc.md
new file mode 100644
index 00000000..b2cf7b9c
--- /dev/null
+++ b/src/plugins/acl/acl_multicore_doc.md
@@ -0,0 +1,349 @@
+Multicore support for ACL plugin {#acl_multicore}
+================================
+
+This captures some considerations and design decisions that I have made,
+both for my own memory later on ("what the hell was I thinking?!?"),
+and for anyone interested to criticize/improve/hack on this code.
+
+One of the factors taken into account while making these decisions
+was the relative emphasis on the multi-thread vs. single-thread
+use cases: the latter is vastly more prevalent. But
+one cannot optimize the single-thread performance without
+having functioning code for multi-thread.
+
+stateless ACLs
+==============
+
+The stateless case trivially parallelizes, and the only potential for a
+race between the different threads is during reconfiguration,
+at the time of replacing the old ACL being checked with
+the new ACL.
+
+In case acl_add_replace is being used to replace the rules
+within the existing entry, the `am->acls[X].rules`
+vector will be reallocated, and its count will potentially change.
+
+acl_match_5tuple() has the following code:
+
+```{.c}
+  a = am->acls + acl_index;
+  for (i = 0; i < a->count; i++)
+    {
+      r = a->rules + i;
+      . . .
+```
+
+Ideally we should be immune from a->rules changing,
+but the problem arises if the count changes in flight
+and the new ruleset is smaller - then we will attempt
+to "match" against the free memory.
+
+This can(?) be solved by replacing the for() with while(),
+so that the comparison against the count happens at each iteration.
+
+full_acl_match_5tuple(), which iterates over the list
+of ACLs, is a bit less immune, since it takes the pointer
+to the vector to iterate over and keeps a local copy of
+that pointer.
+
+This race can be solved by comparing the
+current pointer to the vector with the source pointer,
+and seeing if there is an (unlikely) change; if
+there is, return the "deny" action or, better,
+restart the check.
+
+Since the check reloads the ACL list on a per-packet basis,
+there is only a window of opportunity of one packet during which we
+might "match" a packet against an incorrect rule set.
+The workers also do not change anything; they only read.
+Therefore, it looks like building special structures
+to ensure that this does not happen at all might not be
+worth it.
+
+At least not until we have a unit test able to
+reliably catch this condition and verify that
+the measures applied are effective. Adding code
+which is not possible to exercise is worse than
+not adding any code at all.
+
+So, I opt for "do-nothing" here for the moment.
+
+reflexive ACLs: single-thread
+=============================
+
+Before we talk multi-thread, it is worth revisiting the
+design of the reflexive ACLs in the plugin, and
+the history of their evolution.
+
+The very first version of the ACL plugin, shipped in
+1701, mostly did the job using the existing components
+and gluing them together. Because it needed to work
+in the bridged forwarding path only, using the L2 classifier
+as an insertion point appeared natural; also, the L2 classifier,
+being a table with sessions, seemed like a good place
+to hold the sessions.
+
+So, the original design had two conceptual nodes:
+one, pointed to by the next_miss from the L2 classifier table,
+checked the actual ACL and inserted a session into
+the L2 classifier table; the other one, pointed
+to by the next_match within the specific session rule,
+checked the existing session. The timing out
+of the existing connections was done in the datapath,
+by periodically calling the aging function.
+
+This decision to use the existing components,
+for all its attractiveness, did bring a few limitations as well:
+
+* L2 classifier is a simple mask-and-value match, with
+a fixed mask across the table. So, sanely supporting IPv6
+packets with extension headers in that framework was impossible.
+
+* There is no way to get backpressure from the L2 classifier
+depending on memory usage. When it runs out of memory,
+it simply crashes the box. When does it run out of memory?
+We don't really know. It depends on how it allocates it.
+
+* Since we need to match the *reflected* traffic,
+we had to create *two* full session entries
+in two different directions, which is quite wasteful memory-wise.
+
+* (showstopper): the L2 classifier runs only in
+the bridged data path, so supporting the routed data path
+would require creating something else entirely,
+which would mean many more headaches support-wise going forward.
+
+Because of that, I have moved to a different model: creating
+a session-5-tuple from the packet data - once,
+and then doing all the matching just on that 5-tuple.
+
+This made it possible to add support for skipping IPv6 extension headers.
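+
+Conceptually, the key extracted once per packet looks roughly like the
+sketch below. This is a simplified illustration; the actual structure
+is fa_5tuple_t (declared in fa_node.h), which additionally overlays
+these fields onto a bihash key/value pair.
+
+```{.c}
+typedef struct
+{
+  ip46_address_t addr[2];    /* [0] = source, [1] = destination */
+  u8 proto;                  /* L4 protocol */
+  u16 port[2];               /* [0] = src, [1] = dst, if l4_valid */
+  u8 l4_valid;               /* the L4 header was within the packet */
+  u8 is_nonfirst_fragment;   /* a fragment without an L4 header */
+  u8 tcp_flags;              /* TCP flags, if tcp_flags_valid */
+  u8 tcp_flags_valid;
+} session_key_sketch_t;
+```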
+
+Also, this new version started to store the sessions in a dedicated
+bihash-per-interface, with the session key data being
+aligned for the ingress packets, and being mirrored for the
+egress packets. This allows for significant savings in memory,
+because now we need to keep only one copy of the session table per
+interface instead of two, and also to have only ONE node for all the
+lookups (L2/L3 path, in/out, IPv4/IPv6) - significantly reducing the
+code complexity.
+
+Unfortunately, bihash still has the "lack of backpressure" problem,
+in the sense that if you try to insert too many entries and run out
+of memory in the heap you supplied, you get a crash.
+
+To partially work around that, there is a "maximum tested number of
+sessions" value, against which the number of sessions currently inserted
+in the bihash is tracked, and if this number is being approached, a more
+aggressive cleanup can happen. If this number is reached, two behaviors
+are possible:
+
+* attempt to do the stateless ACL matching and permit the packet
+  if it succeeds
+
+* deny the packet
+
+Currently I have opted for the second one, since it allows for
+better-defined behavior, and if you have to permit
+the traffic in both directions, why use stateful matching anyway?
+
+In order to be able to do the cleanup, we need to discriminate between
+the session types, with each session type having its own idle timeout.
+In order to do that, we keep three lists, defined in enum acl_timeout_e:
+ACL_TIMEOUT_UDP_IDLE, ACL_TIMEOUT_TCP_IDLE, ACL_TIMEOUT_TCP_TRANSIENT.
+
+The first one is hopefully obvious - it is just all UDP connections.
+They have an idle timeout of 600 seconds.
+
+The second and third are a bit more subtle. TCP is a complicated protocol,
+and we need to tread the fine line between doing too little and doing
+too much, lest we trigger potential compatibility issues because of
+being a "middlebox".
+
+I decided to split the TCP connections into two classes:
+established, and everything else. "Established" means we have seen
+the SYN and ACK from both sides (with PUSH obviously masked out).
+This is the "active" state of any TCP connection, and we would like
+to ensure we do not screw it up. So, the connections in this state
+have a default idle timer of 24 hours.
+
+All the rest of the connections have an idle timeout of 2 minutes
+(inspired by an old value of MSL), based on the observation
+that the states this class represents are usually very short-lived.
+
+Once we have these three baskets of connections, it is trivial to
+imagine a simple cleanup mechanism to deal with this: take a
+TCP transient connection that has been hanging around the longest.
+
+It is debatable whether we want to discriminate between the
+different TCP transient connections. Assuming we do FIFO (and
+the lists allow us to do just that), it means a given connection
+at the head of the list has been hanging around the longest.
+Thus, if we are short on resources, we might just go ahead and
+reuse it within the datapath.
+
+This is where we are slowly approaching the question
+"Why in the world have you not used a timer wheel or some such?"
+
+The answer is simple: within the above constraints, it does
+not buy me much.
+
+Also, a timer wheel creates a leaky abstraction with a difficult
+to manage corner case. Which corner case?
+
+We have a set of objects (sessions) with an event that may
+or may not happen (the idle timeout timer firing), and a
+necessity to reset the idle timeout when there is
+activity on the session.
+
+In the worst case, where we had 10000 one-packet
+UDP sessions all created 10 minutes ago, we would need
+to deal with a spike of 10000 expired timers.
+
+Of course, if we have active traffic on all
+of these 10000 connections, then we will not have
+to deal with that - right? Right, but we will still have to deal
+with canceling and requeueing the timers.
+
+In the best possible case, requeueing a timer is
+going to be something along the lines of a linked-list
+removal and reinsertion.
+
+However, keep in mind we already need to classify the
+connections for reuse, so we already have
+the linked lists!
+
+And if we just check these linked lists periodically in
+a FIFO fashion, we can get away with a very simple per-packet operation:
+writing back the timestamp of "now" into the connection structure.
+
+Then, rather than requeueing on a per-packet or per-frame
+basis, we can defer this action until the time this session
+appears at the head of the FIFO list, and the cleaning
+routine makes the decision about whether to discard
+the session (because the interval since the last activity is bigger
+than the idle timeout), or to requeue the session back to
+the end of the list (because the last activity was less
+than the idle timeout ago).
+
+So, rather than using the timers, we can simply reuse our classification
+FIFOs, with the following heuristic: do not look at the session that was
+enqueued at time X until X+session_timeout. If we enqueue the sessions
+in the order of their initial activity, then we can simply use the enqueue
+timestamp of the head session as a decision criterion for when we need
+to get back to looking at it for timeout purposes.
+
+Since the number of FIFOs is small, we get a slightly worse check
+performance than with timers, but still O(1).
+
+We seemingly do quite a few "useless" operations of requeueing the items
+back to the tail of the list - but these are operations we do not
+have to do in the active data path, so overall it is a win.
+
+(Diversion: I believe this problem is congruent to poll vs. epoll or
+events vs. threads; some reading on this subject:
+http://web.archive.org/web/20120225022154/http://sheddingbikes.com/posts/1280829388.html)
+
+We can also run a TCP-like scheme for adaptively changing
+the wait period in the routine that deals with the connection timeouts:
+we can attempt to check the connections a couple of times per second
+(the same as we would advance the timer wheel), and then, if we have
+requeued close to the max-per-quantum number of connections, we can halve
+the waiting interval, and if we did not requeue any, we can slowly
+increase the waiting interval - which at a steady state should stabilize,
+similar to what the TCP rate does.
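+
+Putting the above together, the periodic check might look roughly like
+the sketch below. All the names here are illustrative, not the actual
+ones used in fa_node.c.
+
+```{.c}
+/* For each class FIFO: sessions whose enqueue time is older than the
+   class timeout are inspected; really-idle ones are deleted, recently
+   active ones are requeued to the tail with a fresh enqueue time. */
+static void
+check_idle_sessions_sketch (f64 now)
+{
+  int tclass;
+  for (tclass = 0; tclass < N_TIMEOUT_CLASSES; tclass++)
+    {
+      f64 timeout = class_timeout[tclass];
+      session_sketch_t *s;
+      while ((s = fifo_head (tclass)) && (now > s->enqueue_time + timeout))
+        {
+          fifo_dequeue (tclass, s);
+          if (now > s->last_active_time + timeout)
+            delete_session (s);        /* truly idle: drop it */
+          else
+            {
+              s->enqueue_time = now;   /* saw activity: requeue */
+              fifo_enqueue_tail (tclass, s);
+            }
+        }
+    }
+}
+```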
+
+reflexive ACLs: multi-thread
+=============================
+
+The single-threaded implementation in 1704 used a separate "cleaner"
+process to deal with the timing out of the connections.
+It is all good and great when you know that there is only a single core
+to run everything on, but the existence of the lists proves to be
+a massive difficulty when it comes to operating from multiple threads.
+
+An initial study showed that with a few assumptions (e.g. that the cleaner
+runs in the main thread, and that the cleaner and the worker have a
+demarcation point in time where only one or the other touches the session
+in the list) it might be possible to make it work, but the resulting
+trickiness of doing it neatly, with all the corner cases, is quite large.
+
+So, for the multi-threaded scenario, we need to move the connection
+aging back to the same CPU as its creation.
+
+Luckily we can do this with the help of interrupts.
+
+So, the design is as follows: the aging thread
+(acl_fa_session_cleaner_process) periodically fires interrupts at the
+workers' interrupt nodes (acl_fa_worker_session_cleaner_process_node.index)
+using vlib_node_set_interrupt_pending(), and the interrupt node
+acl_fa_worker_conn_cleaner_process() calls acl_fa_check_idle_sessions(),
+which does the actual job of advancing the lists. Within the actual
+datapath, the only things we do are putting the items onto the FIFO and
+updating the last-active time on the existing connection.
+
+The one "delicate" part is that the worker for one leg of the connection
+might be different from the worker for the other leg - but even if the
+"owner" tries to free the connection, nothing terrible can happen: in the
+worst case, the element of the pool (which is nominally free for a short
+period) will get its timestamp updated - and the same goes for the TCP
+flags seen.
+
+A slightly trickier issue arises when the packet is initially seen by one
+worker (and the connection is thus owned by that worker), while the return
+packet is processed by another worker, and as a result the class of the
+connection changes (e.g. it becomes TCP_ESTABLISHED from TCP_TRANSIENT or
+vice versa). If the class changes from one with a shorter idle time to one
+with a longer idle time, then, unless we are in the starvation mode where
+the transient connections are recycled, we can simply do nothing and let
+the normal requeue mechanism kick in. If the class changes from the longer
+idle timer to the shorter one, then we risk keeping the connection around
+for longer than needed, which will affect the resource usage.
+
+One solution to that is to have NxN ring buffers (where N is the number of
+workers), such that the non-owner can signal to the owner the connection#
+that needs to be requeued out of order.
+
+A simpler solution, though, is to ensure that each FIFO's period is equal
+to that of the shortest timer. This way the resource starvation problem is
+taken care of, at the expense of some additional work.
+
+This all looks sufficiently nice and simple until a skeleton falls out of
+the closet: sometimes we want to clean the connections en masse before
+they expire.
+
+There are a few potential scenarios:
+1) removal of an ACL from the interface
+2) removal of an interface
+3) manual action of an operator (in the future).
+
+In order to tackle this, we need to modify the logic which decides whether
+to requeue the connection at the end of the list, or to delete it due to
+the idle timeout:
+
+We define a point in time, and have each worker thread fast-forward
+through its FIFO, in the process looking for sessions that satisfy the
+criteria, and either keeping them or requeueing them.
+
+To keep up the appearance of simplicity to the outside world, we still
+process this as an event within the connection cleaner thread, but the
+event handler does as follows (a condensed sketch is given after the
+list):
+1) it creates the bitmap of the sw_if_index values requested to be cleared
+2) for each worker, it ensures there is no cleanup operation already in
+progress (waiting if there is one), then makes a copy of the bitmap, sets
+the per-worker flag of a cleanup operation, and sends an interrupt
+3) it waits until all cleanup operations have completed.
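+
+A condensed sketch of that handler is below. The fields
+pending_clear_sw_if_index_bitmap and interrupt_is_needed are discussed
+in the text; the remaining names (per_worker, clear_in_process,
+cleaner_node_index) are illustrative assumptions rather than the exact
+ones used by the plugin.
+
+```{.c}
+/* step 1: clear_bitmap holds the sw_if_index values to be cleared */
+for (wk = 0; wk < n_workers; wk++)
+  {
+    pw = &per_worker[wk];
+    /* step 2: wait out any cleanup already in progress on this worker */
+    while (pw->clear_in_process)
+      {
+        if (pw->interrupt_is_needed)   /* the worker asked for a kick */
+          vlib_node_set_interrupt_pending (vlib_mains[wk + 1],
+                                           cleaner_node_index);
+        vlib_process_suspend (vm, 0.0001);
+      }
+    pw->pending_clear_sw_if_index_bitmap = clib_bitmap_dup (clear_bitmap);
+    pw->clear_in_process = 1;
+    vlib_node_set_interrupt_pending (vlib_mains[wk + 1], cleaner_node_index);
+  }
+/* step 3: wait for every worker to finish its fast-forward quanta */
+for (wk = 0; wk < n_workers; wk++)
+  while (per_worker[wk].clear_in_process)
+    {
+      if (per_worker[wk].interrupt_is_needed)
+        vlib_node_set_interrupt_pending (vlib_mains[wk + 1],
+                                         cleaner_node_index);
+      vlib_process_suspend (vm, 0.0001);
+    }
+```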
+
+Within the worker interrupt node, we check if the "cleanup in progress"
+flag is set, and if it is, we check the "fast forward time" value. If it
+is unset, we initialize it to the value of now, and compare the requested
+bitmap of sw_if_index values (pending_clear_sw_if_index_bitmap) with the
+bitmap of sw_if_index values that this worker deals with.
+
+(We set the bit in the latter bitmap every time we enqueue a packet onto
+a FIFO - serviced_sw_if_index_bitmap in acl_fa_conn_list_add_session.)
+
+If the result of ANDing these two bitmaps is zero, then we can clear the
+cleanup-in-progress flag and return.
+Otherwise we kick off a quantum of cleanup, and make sure we get another
+interrupt ASAP if that cleanup operation returns non-zero, meaning there
+is more work to do.
+When that operation returns zero, everything has been processed; we can
+clear the "cleanup-in-progress" flag, and zeroize the bitmap of
+sw_if_indexes requested to be cleaned.
+
+The interrupt node signals its wish to receive an interrupt ASAP by
+setting the interrupt_is_needed flag within the per-worker structure.
+The main thread, while waiting for the cleanup operation to complete,
+checks if there is a request for an interrupt, and if there is, it
+sends one.
+
+This approach gives us a way to mass-clean the connections while reusing
+the code of the regular idle connection cleanup.
+
+One potential inefficiency is the bitmap values set by the session
+insertion in the data path - there is nothing to clear them.
+
+So, if one rearranges the interface placement with the workers, then the
+cleanups will cause some unnecessary work. For now, we consider it an
+acceptable limitation. It can be resolved by having another per-worker
+bitmap, which, when set, would trigger the cleanup of the bits in the
+serviced_sw_if_index_bitmap.
+
+=== the end ===
+
diff --git a/src/plugins/acl/acl_test.c b/src/plugins/acl/acl_test.c
new file mode 100644
index 00000000..abb9643e
--- /dev/null
+++ b/src/plugins/acl/acl_test.c
@@ -0,0 +1,1219 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ *------------------------------------------------------------------
+ * acl_test.c - test harness plugin
+ *------------------------------------------------------------------
+ */
+
+#include <vat/vat.h>
+#include <vlibapi/api.h>
+#include <vlibmemory/api.h>
+#include <vlibsocket/api.h>
+#include <vppinfra/error.h>
+#include <vnet/ip/ip.h>
+#include <arpa/inet.h>
+
+#define __plugin_msg_base acl_test_main.msg_id_base
+#include <vlibapi/vat_helper_macros.h>
+
+uword unformat_sw_if_index (unformat_input_t * input, va_list * args);
+
+/* Declare message IDs */
+#include <acl/acl_msg_enum.h>
+
+/* define message structures */
+#define vl_typedefs
+#include <acl/acl_all_api_h.h>
+#undef vl_typedefs
+
+/* define message structures */
+#define vl_endianfun
+#include <acl/acl_all_api_h.h>
+#undef vl_endianfun
+
+/* instantiate all the print functions we know about */
+#define vl_print(handle, ...)
+#define vl_printfun +#include <acl/acl_all_api_h.h> +#undef vl_printfun + +/* Get the API version number. */ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <acl/acl_all_api_h.h> +#undef vl_api_version + +typedef struct { + /* API message ID base */ + u16 msg_id_base; + vat_main_t *vat_main; +} acl_test_main_t; + +acl_test_main_t acl_test_main; + +#define foreach_standard_reply_retval_handler \ +_(acl_del_reply) \ +_(acl_interface_add_del_reply) \ +_(macip_acl_interface_add_del_reply) \ +_(acl_interface_set_acl_list_reply) \ +_(macip_acl_del_reply) + +#define foreach_reply_retval_aclindex_handler \ +_(acl_add_replace_reply) \ +_(macip_acl_add_reply) \ +_(macip_acl_add_replace_reply) + +#define _(n) \ + static void vl_api_##n##_t_handler \ + (vl_api_##n##_t * mp) \ + { \ + vat_main_t * vam = acl_test_main.vat_main; \ + i32 retval = ntohl(mp->retval); \ + if (vam->async_mode) { \ + vam->async_errors += (retval < 0); \ + } else { \ + vam->retval = retval; \ + vam->result_ready = 1; \ + } \ + } +foreach_standard_reply_retval_handler; +#undef _ + +#define _(n) \ + static void vl_api_##n##_t_handler \ + (vl_api_##n##_t * mp) \ + { \ + vat_main_t * vam = acl_test_main.vat_main; \ + i32 retval = ntohl(mp->retval); \ + if (vam->async_mode) { \ + vam->async_errors += (retval < 0); \ + } else { \ + clib_warning("ACL index: %d", ntohl(mp->acl_index)); \ + vam->retval = retval; \ + vam->result_ready = 1; \ + } \ + } +foreach_reply_retval_aclindex_handler; +#undef _ + +/* These two ought to be in a library somewhere but they aren't */ +static uword +my_unformat_mac_address (unformat_input_t * input, va_list * args) +{ + u8 *a = va_arg (*args, u8 *); + return unformat (input, "%x:%x:%x:%x:%x:%x", &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5]); +} + +static u8 * +my_format_mac_address (u8 * s, va_list * args) +{ + u8 *a = va_arg (*args, u8 *); + return format (s, "%02x:%02x:%02x:%02x:%02x:%02x", + a[0], a[1], a[2], a[3], a[4], a[5]); +} + + + +static void vl_api_acl_plugin_get_version_reply_t_handler + (vl_api_acl_plugin_get_version_reply_t * mp) + { + vat_main_t * vam = acl_test_main.vat_main; + clib_warning("ACL plugin version: %d.%d", ntohl(mp->major), ntohl(mp->minor)); + vam->result_ready = 1; + } + +static void vl_api_acl_interface_list_details_t_handler + (vl_api_acl_interface_list_details_t * mp) + { + int i; + vat_main_t * vam = acl_test_main.vat_main; + u8 *out = 0; + vl_api_acl_interface_list_details_t_endian(mp); + out = format(out, "sw_if_index: %d, count: %d, n_input: %d\n", mp->sw_if_index, mp->count, mp->n_input); + out = format(out, " input "); + for(i=0; i<mp->count; i++) { + if (i == mp->n_input) + out = format(out, "\n output "); + out = format(out, "%d ", ntohl (mp->acls[i])); + } + out = format(out, "\n"); + clib_warning("%s", out); + vec_free(out); + vam->result_ready = 1; + } + + +static inline u8 * +vl_api_acl_rule_t_pretty_format (u8 *out, vl_api_acl_rule_t * a) +{ + int af = a->is_ipv6 ? AF_INET6 : AF_INET; + u8 src[INET6_ADDRSTRLEN]; + u8 dst[INET6_ADDRSTRLEN]; + inet_ntop(af, a->src_ip_addr, (void *)src, sizeof(src)); + inet_ntop(af, a->dst_ip_addr, (void *)dst, sizeof(dst)); + + out = format(out, "%s action %d src %s/%d dst %s/%d proto %d sport %d-%d dport %d-%d tcpflags %d mask %d", + a->is_ipv6 ? 
"ipv6" : "ipv4", a->is_permit, + src, a->src_ip_prefix_len, + dst, a->dst_ip_prefix_len, + a->proto, + a->srcport_or_icmptype_first, a->srcport_or_icmptype_last, + a->dstport_or_icmpcode_first, a->dstport_or_icmpcode_last, + a->tcp_flags_value, a->tcp_flags_mask); + return(out); +} + + + +static void vl_api_acl_details_t_handler + (vl_api_acl_details_t * mp) + { + int i; + vat_main_t * vam = acl_test_main.vat_main; + vl_api_acl_details_t_endian(mp); + u8 *out = 0; + out = format(0, "acl_index: %d, count: %d\n tag {%s}\n", mp->acl_index, mp->count, mp->tag); + for(i=0; i<mp->count; i++) { + out = format(out, " "); + out = vl_api_acl_rule_t_pretty_format(out, &mp->r[i]); + out = format(out, "%s\n", i<mp->count-1 ? "," : ""); + } + clib_warning("%s", out); + vec_free(out); + vam->result_ready = 1; + } + +static inline u8 * +vl_api_macip_acl_rule_t_pretty_format (u8 *out, vl_api_macip_acl_rule_t * a) +{ + int af = a->is_ipv6 ? AF_INET6 : AF_INET; + u8 src[INET6_ADDRSTRLEN]; + inet_ntop(af, a->src_ip_addr, (void *)src, sizeof(src)); + + out = format(out, "%s action %d ip %s/%d mac %U mask %U", + a->is_ipv6 ? "ipv6" : "ipv4", a->is_permit, + src, a->src_ip_prefix_len, + my_format_mac_address, a->src_mac, + my_format_mac_address, a->src_mac_mask); + return(out); +} + + +static void vl_api_macip_acl_details_t_handler + (vl_api_macip_acl_details_t * mp) + { + int i; + vat_main_t * vam = acl_test_main.vat_main; + vl_api_macip_acl_details_t_endian(mp); + u8 *out = format(0,"MACIP acl_index: %d, count: %d\n tag {%s}\n", mp->acl_index, mp->count, mp->tag); + for(i=0; i<mp->count; i++) { + out = format(out, " "); + out = vl_api_macip_acl_rule_t_pretty_format(out, &mp->r[i]); + out = format(out, "%s\n", i<mp->count-1 ? "," : ""); + } + clib_warning("%s", out); + vec_free(out); + vam->result_ready = 1; + } + +static void vl_api_macip_acl_interface_get_reply_t_handler + (vl_api_macip_acl_interface_get_reply_t * mp) + { + int i; + vat_main_t * vam = acl_test_main.vat_main; + u8 *out = format(0, "sw_if_index with MACIP ACL count: %d\n", ntohl(mp->count)); + for(i=0; i<ntohl(mp->count); i++) { + out = format(out, " macip_acl_interface_add_del sw_if_index %d add acl %d\n", i, ntohl(mp->acls[i])); + } + out = format(out, "\n"); + clib_warning("%s", out); + vec_free(out); + vam->result_ready = 1; + } + +static void vl_api_acl_plugin_control_ping_reply_t_handler + (vl_api_acl_plugin_control_ping_reply_t * mp) +{ + vat_main_t *vam = &vat_main; + i32 retval = ntohl (mp->retval); + if (vam->async_mode) + { + vam->async_errors += (retval < 0); + } + else + { + vam->retval = retval; + vam->result_ready = 1; + } +} + + +/* + * Table of message reply handlers, must include boilerplate handlers + * we just generated + */ +#define foreach_vpe_api_reply_msg \ +_(ACL_ADD_REPLACE_REPLY, acl_add_replace_reply) \ +_(ACL_DEL_REPLY, acl_del_reply) \ +_(ACL_INTERFACE_ADD_DEL_REPLY, acl_interface_add_del_reply) \ +_(ACL_INTERFACE_SET_ACL_LIST_REPLY, acl_interface_set_acl_list_reply) \ +_(ACL_INTERFACE_LIST_DETAILS, acl_interface_list_details) \ +_(ACL_DETAILS, acl_details) \ +_(MACIP_ACL_ADD_REPLY, macip_acl_add_reply) \ +_(MACIP_ACL_ADD_REPLACE_REPLY, macip_acl_add_replace_reply) \ +_(MACIP_ACL_DEL_REPLY, macip_acl_del_reply) \ +_(MACIP_ACL_DETAILS, macip_acl_details) \ +_(MACIP_ACL_INTERFACE_ADD_DEL_REPLY, macip_acl_interface_add_del_reply) \ +_(MACIP_ACL_INTERFACE_GET_REPLY, macip_acl_interface_get_reply) \ +_(ACL_PLUGIN_CONTROL_PING_REPLY, acl_plugin_control_ping_reply) \ +_(ACL_PLUGIN_GET_VERSION_REPLY, 
acl_plugin_get_version_reply) + +static int api_acl_plugin_get_version (vat_main_t * vam) +{ + acl_test_main_t * sm = &acl_test_main; + vl_api_acl_plugin_get_version_t * mp; + u32 msg_size = sizeof(*mp); + int ret; + + vam->result_ready = 0; + mp = vl_msg_api_alloc_as_if_client(msg_size); + memset (mp, 0, msg_size); + mp->_vl_msg_id = ntohs (VL_API_ACL_PLUGIN_GET_VERSION + sm->msg_id_base); + mp->client_index = vam->my_client_index; + + /* send it... */ + S(mp); + + /* Wait for a reply... */ + W (ret); + return ret; +} + +static int api_macip_acl_interface_get (vat_main_t * vam) +{ + acl_test_main_t * sm = &acl_test_main; + vl_api_acl_plugin_get_version_t * mp; + u32 msg_size = sizeof(*mp); + int ret; + + vam->result_ready = 0; + mp = vl_msg_api_alloc_as_if_client(msg_size); + memset (mp, 0, msg_size); + mp->_vl_msg_id = ntohs (VL_API_MACIP_ACL_INTERFACE_GET + sm->msg_id_base); + mp->client_index = vam->my_client_index; + + /* send it... */ + S(mp); + + /* Wait for a reply... */ + W (ret); + return ret; +} + +#define vec_validate_acl_rules(v, idx) \ + do { \ + if (vec_len(v) < idx+1) { \ + vec_validate(v, idx); \ + v[idx].is_permit = 0x1; \ + v[idx].srcport_or_icmptype_last = 0xffff; \ + v[idx].dstport_or_icmpcode_last = 0xffff; \ + } \ + } while (0) + + +static int api_acl_add_replace (vat_main_t * vam) +{ + acl_test_main_t * sm = &acl_test_main; + unformat_input_t * i = vam->input; + vl_api_acl_add_replace_t * mp; + u32 acl_index = ~0; + u32 msg_size = sizeof (*mp); /* without the rules */ + + vl_api_acl_rule_t *rules = 0; + int rule_idx = 0; + int n_rules = 0; + int n_rules_override = -1; + u32 proto = 0; + u32 port1 = 0; + u32 port2 = 0; + u32 action = 0; + u32 tcpflags, tcpmask; + u32 src_prefix_length = 0, dst_prefix_length = 0; + ip4_address_t src_v4address, dst_v4address; + ip6_address_t src_v6address, dst_v6address; + u8 *tag = 0; + int ret; + + if (!unformat (i, "%d", &acl_index)) { + /* Just assume -1 */ + } + + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) + { + if (unformat (i, "ipv6")) + { + vec_validate_acl_rules(rules, rule_idx); + rules[rule_idx].is_ipv6 = 1; + } + else if (unformat (i, "ipv4")) + { + vec_validate_acl_rules(rules, rule_idx); + rules[rule_idx].is_ipv6 = 0; + } + else if (unformat (i, "permit+reflect")) + { + vec_validate_acl_rules(rules, rule_idx); + rules[rule_idx].is_permit = 2; + } + else if (unformat (i, "permit")) + { + vec_validate_acl_rules(rules, rule_idx); + rules[rule_idx].is_permit = 1; + } + else if (unformat (i, "deny")) + { + vec_validate_acl_rules(rules, rule_idx); + rules[rule_idx].is_permit = 0; + } + else if (unformat (i, "count %d", &n_rules_override)) + { + /* we will use this later */ + } + else if (unformat (i, "action %d", &action)) + { + vec_validate_acl_rules(rules, rule_idx); + rules[rule_idx].is_permit = action; + } + else if (unformat (i, "src %U/%d", + unformat_ip4_address, &src_v4address, &src_prefix_length)) + { + vec_validate_acl_rules(rules, rule_idx); + memcpy (rules[rule_idx].src_ip_addr, &src_v4address, 4); + rules[rule_idx].src_ip_prefix_len = src_prefix_length; + rules[rule_idx].is_ipv6 = 0; + } + else if (unformat (i, "src %U/%d", + unformat_ip6_address, &src_v6address, &src_prefix_length)) + { + vec_validate_acl_rules(rules, rule_idx); + memcpy (rules[rule_idx].src_ip_addr, &src_v6address, 16); + rules[rule_idx].src_ip_prefix_len = src_prefix_length; + rules[rule_idx].is_ipv6 = 1; + } + else if (unformat (i, "dst %U/%d", + unformat_ip4_address, &dst_v4address, &dst_prefix_length)) + { + 
vec_validate_acl_rules(rules, rule_idx); + memcpy (rules[rule_idx].dst_ip_addr, &dst_v4address, 4); + rules[rule_idx].dst_ip_prefix_len = dst_prefix_length; + rules[rule_idx].is_ipv6 = 0; + } + else if (unformat (i, "dst %U/%d", + unformat_ip6_address, &dst_v6address, &dst_prefix_length)) + { + vec_validate_acl_rules(rules, rule_idx); + memcpy (rules[rule_idx].dst_ip_addr, &dst_v6address, 16); + rules[rule_idx].dst_ip_prefix_len = dst_prefix_length; + rules[rule_idx].is_ipv6 = 1; + } + else if (unformat (i, "sport %d-%d", &port1, &port2)) + { + vec_validate_acl_rules(rules, rule_idx); + rules[rule_idx].srcport_or_icmptype_first = htons(port1); + rules[rule_idx].srcport_or_icmptype_last = htons(port2); + } + else if (unformat (i, "sport %d", &port1)) + { + vec_validate_acl_rules(rules, rule_idx); + rules[rule_idx].srcport_or_icmptype_first = htons(port1); + rules[rule_idx].srcport_or_icmptype_last = htons(port1); + } + else if (unformat (i, "dport %d-%d", &port1, &port2)) + { + vec_validate_acl_rules(rules, rule_idx); + rules[rule_idx].dstport_or_icmpcode_first = htons(port1); + rules[rule_idx].dstport_or_icmpcode_last = htons(port2); + } + else if (unformat (i, "dport %d", &port1)) + { + vec_validate_acl_rules(rules, rule_idx); + rules[rule_idx].dstport_or_icmpcode_first = htons(port1); + rules[rule_idx].dstport_or_icmpcode_last = htons(port1); + } + else if (unformat (i, "tcpflags %d %d", &tcpflags, &tcpmask)) + { + vec_validate_acl_rules(rules, rule_idx); + rules[rule_idx].tcp_flags_value = tcpflags; + rules[rule_idx].tcp_flags_mask = tcpmask; + } + else if (unformat (i, "tcpflags %d mask %d", &tcpflags, &tcpmask)) + { + vec_validate_acl_rules(rules, rule_idx); + rules[rule_idx].tcp_flags_value = tcpflags; + rules[rule_idx].tcp_flags_mask = tcpmask; + } + else if (unformat (i, "proto %d", &proto)) + { + vec_validate_acl_rules(rules, rule_idx); + rules[rule_idx].proto = proto; + } + else if (unformat (i, "tag %s", &tag)) + { + } + else if (unformat (i, ",")) + { + rule_idx++; + vec_validate_acl_rules(rules, rule_idx); + } + else + break; + } + + /* Construct the API message */ + vam->result_ready = 0; + + if(rules) + n_rules = vec_len(rules); + else + n_rules = 0; + + if (n_rules_override >= 0) + n_rules = n_rules_override; + + msg_size += n_rules*sizeof(rules[0]); + + mp = vl_msg_api_alloc_as_if_client(msg_size); + memset (mp, 0, msg_size); + mp->_vl_msg_id = ntohs (VL_API_ACL_ADD_REPLACE + sm->msg_id_base); + mp->client_index = vam->my_client_index; + if ((n_rules > 0) && rules) + clib_memcpy(mp->r, rules, n_rules*sizeof (vl_api_acl_rule_t)); + if (tag) + { + if (vec_len(tag) >= sizeof(mp->tag)) + { + tag[sizeof(mp->tag)-1] = 0; + _vec_len(tag) = sizeof(mp->tag); + } + clib_memcpy(mp->tag, tag, vec_len(tag)); + vec_free(tag); + } + mp->acl_index = ntohl(acl_index); + mp->count = htonl(n_rules); + + /* send it... */ + S(mp); + + /* Wait for a reply... */ + W (ret); + return ret; +} + +static int api_acl_del (vat_main_t * vam) +{ + unformat_input_t * i = vam->input; + vl_api_acl_del_t * mp; + u32 acl_index = ~0; + int ret; + + if (!unformat (i, "%d", &acl_index)) { + errmsg ("missing acl index\n"); + return -99; + } + + /* Construct the API message */ + M(ACL_DEL, mp); + mp->acl_index = ntohl(acl_index); + + /* send it... */ + S(mp); + + /* Wait for a reply... 
*/ + W (ret); + return ret; +} + +static int api_macip_acl_del (vat_main_t * vam) +{ + unformat_input_t * i = vam->input; + vl_api_acl_del_t * mp; + u32 acl_index = ~0; + int ret; + + if (!unformat (i, "%d", &acl_index)) { + errmsg ("missing acl index\n"); + return -99; + } + + /* Construct the API message */ + M(MACIP_ACL_DEL, mp); + mp->acl_index = ntohl(acl_index); + + /* send it... */ + S(mp); + + /* Wait for a reply... */ + W (ret); + return ret; +} + +static int api_acl_interface_add_del (vat_main_t * vam) +{ + unformat_input_t * i = vam->input; + vl_api_acl_interface_add_del_t * mp; + u32 sw_if_index = ~0; + u32 acl_index = ~0; + u8 is_input = 0; + u8 is_add = 0; + int ret; + +// acl_interface_add_del <intfc> | sw_if_index <if-idx> acl_index <acl-idx> [out] [del] + + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) + { + if (unformat (i, "%d", &acl_index)) + ; + else + break; + } + + + /* Parse args required to build the message */ + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) { + if (unformat (i, "%U", unformat_sw_if_index, vam, &sw_if_index)) + ; + else if (unformat (i, "sw_if_index %d", &sw_if_index)) + ; + else if (unformat (i, "add")) + is_add = 1; + else if (unformat (i, "del")) + is_add = 0; + else if (unformat (i, "acl %d", &acl_index)) + ; + else if (unformat (i, "input")) + is_input = 1; + else if (unformat (i, "output")) + is_input = 0; + else + break; + } + + if (sw_if_index == ~0) { + errmsg ("missing interface name / explicit sw_if_index number \n"); + return -99; + } + + if (acl_index == ~0) { + errmsg ("missing ACL index\n"); + return -99; + } + + + + /* Construct the API message */ + M(ACL_INTERFACE_ADD_DEL, mp); + mp->acl_index = ntohl(acl_index); + mp->sw_if_index = ntohl(sw_if_index); + mp->is_add = is_add; + mp->is_input = is_input; + + /* send it... */ + S(mp); + + /* Wait for a reply... */ + W (ret); + return ret; +} + +static int api_macip_acl_interface_add_del (vat_main_t * vam) +{ + unformat_input_t * i = vam->input; + vl_api_macip_acl_interface_add_del_t * mp; + u32 sw_if_index = ~0; + u32 acl_index = ~0; + u8 is_add = 0; + int ret; + + /* Parse args required to build the message */ + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) { + if (unformat (i, "%U", unformat_sw_if_index, vam, &sw_if_index)) + ; + else if (unformat (i, "sw_if_index %d", &sw_if_index)) + ; + else if (unformat (i, "add")) + is_add = 1; + else if (unformat (i, "del")) + is_add = 0; + else if (unformat (i, "acl %d", &acl_index)) + ; + else + break; + } + + if (sw_if_index == ~0) { + errmsg ("missing interface name / explicit sw_if_index number \n"); + return -99; + } + + if (acl_index == ~0) { + errmsg ("missing ACL index\n"); + return -99; + } + + + + /* Construct the API message */ + M(MACIP_ACL_INTERFACE_ADD_DEL, mp); + mp->acl_index = ntohl(acl_index); + mp->sw_if_index = ntohl(sw_if_index); + mp->is_add = is_add; + + /* send it... */ + S(mp); + + /* Wait for a reply... 
*/ + W (ret); + return ret; +} + +static int api_acl_interface_set_acl_list (vat_main_t * vam) +{ + unformat_input_t * i = vam->input; + vl_api_acl_interface_set_acl_list_t * mp; + u32 sw_if_index = ~0; + u32 acl_index = ~0; + u32 *inacls = 0; + u32 *outacls = 0; + u8 is_input = 0; + int ret; + +// acl_interface_set_acl_list <intfc> | sw_if_index <if-idx> input [acl-idx list] output [acl-idx list] + + /* Parse args required to build the message */ + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) { + if (unformat (i, "%U", unformat_sw_if_index, vam, &sw_if_index)) + ; + else if (unformat (i, "sw_if_index %d", &sw_if_index)) + ; + else if (unformat (i, "%d", &acl_index)) + { + if(is_input) + vec_add1(inacls, htonl(acl_index)); + else + vec_add1(outacls, htonl(acl_index)); + } + else if (unformat (i, "acl %d", &acl_index)) + ; + else if (unformat (i, "input")) + is_input = 1; + else if (unformat (i, "output")) + is_input = 0; + else + break; + } + + if (sw_if_index == ~0) { + errmsg ("missing interface name / explicit sw_if_index number \n"); + return -99; + } + + /* Construct the API message */ + M2(ACL_INTERFACE_SET_ACL_LIST, mp, sizeof(u32) * (vec_len(inacls) + vec_len(outacls))); + mp->sw_if_index = ntohl(sw_if_index); + mp->n_input = vec_len(inacls); + mp->count = vec_len(inacls) + vec_len(outacls); + vec_append(inacls, outacls); + if (vec_len(inacls) > 0) + clib_memcpy(mp->acls, inacls, vec_len(inacls)*sizeof(u32)); + + /* send it... */ + S(mp); + + /* Wait for a reply... */ + W (ret); + return ret; +} + +static void +api_acl_send_control_ping(vat_main_t *vam) +{ + vl_api_acl_plugin_control_ping_t *mp_ping; + + M(ACL_PLUGIN_CONTROL_PING, mp_ping); + S(mp_ping); +} + + +static int api_acl_interface_list_dump (vat_main_t * vam) +{ + unformat_input_t * i = vam->input; + u32 sw_if_index = ~0; + vl_api_acl_interface_list_dump_t * mp; + int ret; + + /* Parse args required to build the message */ + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) { + if (unformat (i, "%U", unformat_sw_if_index, vam, &sw_if_index)) + ; + else if (unformat (i, "sw_if_index %d", &sw_if_index)) + ; + else + break; + } + + /* Construct the API message */ + M(ACL_INTERFACE_LIST_DUMP, mp); + mp->sw_if_index = ntohl (sw_if_index); + + /* send it... */ + S(mp); + + /* Use control ping for synchronization */ + api_acl_send_control_ping(vam); + + /* Wait for a reply... */ + W (ret); + return ret; +} + +static int api_acl_dump (vat_main_t * vam) +{ + unformat_input_t * i = vam->input; + u32 acl_index = ~0; + vl_api_acl_dump_t * mp; + int ret; + + /* Parse args required to build the message */ + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) { + if (unformat (i, "%d", &acl_index)) + ; + else + break; + } + + /* Construct the API message */ + M(ACL_DUMP, mp); + mp->acl_index = ntohl (acl_index); + + /* send it... */ + S(mp); + + /* Use control ping for synchronization */ + api_acl_send_control_ping(vam); + + /* Wait for a reply... */ + W (ret); + return ret; +} + +static int api_macip_acl_dump (vat_main_t * vam) +{ + unformat_input_t * i = vam->input; + u32 acl_index = ~0; + vl_api_acl_dump_t * mp; + int ret; + + /* Parse args required to build the message */ + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) { + if (unformat (i, "%d", &acl_index)) + ; + else + break; + } + + /* Construct the API message */ + M(MACIP_ACL_DUMP, mp); + mp->acl_index = ntohl (acl_index); + + /* send it... 
*/ + S(mp); + + /* Use control ping for synchronization */ + api_acl_send_control_ping(vam); + + /* Wait for a reply... */ + W (ret); + return ret; +} + +#define vec_validate_macip_acl_rules(v, idx) \ + do { \ + if (vec_len(v) < idx+1) { \ + vec_validate(v, idx); \ + v[idx].is_permit = 0x1; \ + } \ + } while (0) + + +static int api_macip_acl_add (vat_main_t * vam) +{ + acl_test_main_t * sm = &acl_test_main; + unformat_input_t * i = vam->input; + vl_api_macip_acl_add_t * mp; + u32 msg_size = sizeof (*mp); /* without the rules */ + + vl_api_macip_acl_rule_t *rules = 0; + int rule_idx = 0; + int n_rules = 0; + int n_rules_override = -1; + u32 src_prefix_length = 0; + u32 action = 0; + ip4_address_t src_v4address; + ip6_address_t src_v6address; + u8 src_mac[6]; + u8 *tag = 0; + u8 mac_mask_all_1[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + int ret; + + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) + { + if (unformat (i, "ipv6")) + { + vec_validate_macip_acl_rules(rules, rule_idx); + rules[rule_idx].is_ipv6 = 1; + } + else if (unformat (i, "ipv4")) + { + vec_validate_macip_acl_rules(rules, rule_idx); + rules[rule_idx].is_ipv6 = 0; + } + else if (unformat (i, "permit")) + { + vec_validate_macip_acl_rules(rules, rule_idx); + rules[rule_idx].is_permit = 1; + } + else if (unformat (i, "deny")) + { + vec_validate_macip_acl_rules(rules, rule_idx); + rules[rule_idx].is_permit = 0; + } + else if (unformat (i, "count %d", &n_rules_override)) + { + /* we will use this later */ + } + else if (unformat (i, "action %d", &action)) + { + vec_validate_macip_acl_rules(rules, rule_idx); + rules[rule_idx].is_permit = action; + } + else if (unformat (i, "ip %U/%d", + unformat_ip4_address, &src_v4address, &src_prefix_length) || + unformat (i, "ip %U", + unformat_ip4_address, &src_v4address)) + { + if (src_prefix_length == 0) + src_prefix_length = 32; + vec_validate_macip_acl_rules(rules, rule_idx); + memcpy (rules[rule_idx].src_ip_addr, &src_v4address, 4); + rules[rule_idx].src_ip_prefix_len = src_prefix_length; + rules[rule_idx].is_ipv6 = 0; + } + else if (unformat (i, "src")) + { + /* Everything in MACIP is "source" but allow this verbosity */ + } + else if (unformat (i, "ip %U/%d", + unformat_ip6_address, &src_v6address, &src_prefix_length) || + unformat (i, "ip %U", + unformat_ip6_address, &src_v6address)) + { + if (src_prefix_length == 0) + src_prefix_length = 128; + vec_validate_macip_acl_rules(rules, rule_idx); + memcpy (rules[rule_idx].src_ip_addr, &src_v6address, 16); + rules[rule_idx].src_ip_prefix_len = src_prefix_length; + rules[rule_idx].is_ipv6 = 1; + } + else if (unformat (i, "mac %U", + my_unformat_mac_address, &src_mac)) + { + vec_validate_macip_acl_rules(rules, rule_idx); + memcpy (rules[rule_idx].src_mac, &src_mac, 6); + memcpy (rules[rule_idx].src_mac_mask, &mac_mask_all_1, 6); + } + else if (unformat (i, "mask %U", + my_unformat_mac_address, &src_mac)) + { + vec_validate_macip_acl_rules(rules, rule_idx); + memcpy (rules[rule_idx].src_mac_mask, &src_mac, 6); + } + else if (unformat (i, "tag %s", &tag)) + { + } + else if (unformat (i, ",")) + { + rule_idx++; + vec_validate_macip_acl_rules(rules, rule_idx); + } + else + break; + } + + /* Construct the API message */ + vam->result_ready = 0; + + if(rules) + n_rules = vec_len(rules); + + if (n_rules_override >= 0) + n_rules = n_rules_override; + + msg_size += n_rules*sizeof(rules[0]); + + mp = vl_msg_api_alloc_as_if_client(msg_size); + memset (mp, 0, msg_size); + mp->_vl_msg_id = ntohs (VL_API_MACIP_ACL_ADD + sm->msg_id_base); + 
mp->client_index = vam->my_client_index; + if ((n_rules > 0) && rules) + clib_memcpy(mp->r, rules, n_rules*sizeof (mp->r[0])); + if (tag) + { + if (vec_len(tag) >= sizeof(mp->tag)) + { + tag[sizeof(mp->tag)-1] = 0; + _vec_len(tag) = sizeof(mp->tag); + } + clib_memcpy(mp->tag, tag, vec_len(tag)); + vec_free(tag); + } + + mp->count = htonl(n_rules); + + /* send it... */ + S(mp); + + /* Wait for a reply... */ + W (ret); + return ret; +} + +static int api_macip_acl_add_replace (vat_main_t * vam) +{ + acl_test_main_t * sm = &acl_test_main; + unformat_input_t * i = vam->input; + vl_api_macip_acl_add_replace_t * mp; + u32 acl_index = ~0; + u32 msg_size = sizeof (*mp); /* without the rules */ + + vl_api_macip_acl_rule_t *rules = 0; + int rule_idx = 0; + int n_rules = 0; + int n_rules_override = -1; + u32 src_prefix_length = 0; + u32 action = 0; + ip4_address_t src_v4address; + ip6_address_t src_v6address; + u8 src_mac[6]; + u8 *tag = 0; + u8 mac_mask_all_1[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + int ret; + + if (!unformat (i, "%d", &acl_index)) { + /* Just assume -1 */ + } + + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) + { + if (unformat (i, "ipv6")) + { + vec_validate_macip_acl_rules(rules, rule_idx); + rules[rule_idx].is_ipv6 = 1; + } + else if (unformat (i, "ipv4")) + { + vec_validate_macip_acl_rules(rules, rule_idx); + rules[rule_idx].is_ipv6 = 0; + } + else if (unformat (i, "permit")) + { + vec_validate_macip_acl_rules(rules, rule_idx); + rules[rule_idx].is_permit = 1; + } + else if (unformat (i, "deny")) + { + vec_validate_macip_acl_rules(rules, rule_idx); + rules[rule_idx].is_permit = 0; + } + else if (unformat (i, "count %d", &n_rules_override)) + { + /* we will use this later */ + } + else if (unformat (i, "action %d", &action)) + { + vec_validate_macip_acl_rules(rules, rule_idx); + rules[rule_idx].is_permit = action; + } + else if (unformat (i, "ip %U/%d", + unformat_ip4_address, &src_v4address, &src_prefix_length) || + unformat (i, "ip %U", + unformat_ip4_address, &src_v4address)) + { + if (src_prefix_length == 0) + src_prefix_length = 32; + vec_validate_macip_acl_rules(rules, rule_idx); + memcpy (rules[rule_idx].src_ip_addr, &src_v4address, 4); + rules[rule_idx].src_ip_prefix_len = src_prefix_length; + rules[rule_idx].is_ipv6 = 0; + } + else if (unformat (i, "src")) + { + /* Everything in MACIP is "source" but allow this verbosity */ + } + else if (unformat (i, "ip %U/%d", + unformat_ip6_address, &src_v6address, &src_prefix_length) || + unformat (i, "ip %U", + unformat_ip6_address, &src_v6address)) + { + if (src_prefix_length == 0) + src_prefix_length = 128; + vec_validate_macip_acl_rules(rules, rule_idx); + memcpy (rules[rule_idx].src_ip_addr, &src_v6address, 16); + rules[rule_idx].src_ip_prefix_len = src_prefix_length; + rules[rule_idx].is_ipv6 = 1; + } + else if (unformat (i, "mac %U", + my_unformat_mac_address, &src_mac)) + { + vec_validate_macip_acl_rules(rules, rule_idx); + memcpy (rules[rule_idx].src_mac, &src_mac, 6); + memcpy (rules[rule_idx].src_mac_mask, &mac_mask_all_1, 6); + } + else if (unformat (i, "mask %U", + my_unformat_mac_address, &src_mac)) + { + vec_validate_macip_acl_rules(rules, rule_idx); + memcpy (rules[rule_idx].src_mac_mask, &src_mac, 6); + } + else if (unformat (i, "tag %s", &tag)) + { + } + else if (unformat (i, ",")) + { + rule_idx++; + vec_validate_macip_acl_rules(rules, rule_idx); + } + else + break; + } + + if (!rules) + { + errmsg ("rule/s required\n"); + return -99; + } + /* Construct the API message */ + vam->result_ready = 0; 
+ + if(rules) + n_rules = vec_len(rules); + + if (n_rules_override >= 0) + n_rules = n_rules_override; + + msg_size += n_rules*sizeof(rules[0]); + + mp = vl_msg_api_alloc_as_if_client(msg_size); + memset (mp, 0, msg_size); + mp->_vl_msg_id = ntohs (VL_API_MACIP_ACL_ADD_REPLACE + sm->msg_id_base); + mp->client_index = vam->my_client_index; + if ((n_rules > 0) && rules) + clib_memcpy(mp->r, rules, n_rules*sizeof (mp->r[0])); + if (tag) + { + if (vec_len(tag) >= sizeof(mp->tag)) + { + tag[sizeof(mp->tag)-1] = 0; + _vec_len(tag) = sizeof(mp->tag); + } + clib_memcpy(mp->tag, tag, vec_len(tag)); + vec_free(tag); + } + + mp->acl_index = ntohl(acl_index); + mp->count = htonl(n_rules); + + /* send it... */ + S(mp); + + /* Wait for a reply... */ + W (ret); + return ret; +} + +/* + * List of messages that the api test plugin sends, + * and that the data plane plugin processes + */ +#define foreach_vpe_api_msg \ +_(acl_plugin_get_version, "") \ +_(acl_add_replace, "<acl-idx> [<ipv4|ipv6> <permit|permit+reflect|deny|action N> [src IP/plen] [dst IP/plen] [sport X-Y] [dport X-Y] [proto P] [tcpflags FL MASK], ... , ...") \ +_(acl_del, "<acl-idx>") \ +_(acl_dump, "[<acl-idx>]") \ +_(acl_interface_add_del, "<intfc> | sw_if_index <if-idx> [add|del] [input|output] acl <acl-idx>") \ +_(acl_interface_set_acl_list, "<intfc> | sw_if_index <if-idx> input [acl-idx list] output [acl-idx list]") \ +_(acl_interface_list_dump, "[<intfc> | sw_if_index <if-idx>]") \ +_(macip_acl_add, "...") \ +_(macip_acl_add_replace, "<acl-idx> [<ipv4|ipv6> <permit|deny|action N> [count <count>] [src] ip <ipaddress/[plen]> mac <mac> mask <mac_mask>, ... , ...") \ +_(macip_acl_del, "<acl-idx>")\ +_(macip_acl_dump, "[<acl-idx>]") \ +_(macip_acl_interface_add_del, "<intfc> | sw_if_index <if-idx> [add|del] acl <acl-idx>") \ +_(macip_acl_interface_get, "") + + +static +void acl_vat_api_hookup (vat_main_t *vam) +{ + acl_test_main_t * sm = &acl_test_main; + /* Hook up handlers for replies from the data plane plug-in */ +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + sm->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_vpe_api_reply_msg; +#undef _ + + /* API messages we can send */ +#define _(n,h) hash_set_mem (vam->function_by_name, #n, api_##n); + foreach_vpe_api_msg; +#undef _ + + /* Help strings */ +#define _(n,h) hash_set_mem (vam->help_by_name, #n, h); + foreach_vpe_api_msg; +#undef _ +} + +clib_error_t * vat_plugin_register (vat_main_t *vam) +{ + acl_test_main_t * sm = &acl_test_main; + u8 * name; + + sm->vat_main = vam; + + name = format (0, "acl_%08x%c", api_version, 0); + sm->msg_id_base = vl_client_get_first_plugin_msg_id ((char *) name); + + if (sm->msg_id_base != (u16) ~0) + acl_vat_api_hookup (vam); + + vec_free(name); + + return 0; +} diff --git a/src/plugins/acl/fa_node.c b/src/plugins/acl/fa_node.c new file mode 100644 index 00000000..a4ba967d --- /dev/null +++ b/src/plugins/acl/fa_node.c @@ -0,0 +1,1874 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <stddef.h> +#include <netinet/in.h> + +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vppinfra/error.h> +#include <acl/acl.h> +#include <vppinfra/bihash_40_8.h> + +#include <vppinfra/bihash_template.h> +#include <vppinfra/bihash_template.c> + +#include "fa_node.h" +#include "hash_lookup.h" + +typedef struct +{ + u32 next_index; + u32 sw_if_index; + u32 match_acl_in_index; + u32 match_rule_index; + u64 packet_info[6]; + u32 trace_bitmap; + u8 action; +} acl_fa_trace_t; + +/* packet trace format function */ +static u8 * +format_acl_fa_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + acl_fa_trace_t *t = va_arg (*args, acl_fa_trace_t *); + + s = + format (s, + "acl-plugin: sw_if_index %d, next index %d, action: %d, match: acl %d rule %d trace_bits %08x\n" + " pkt info %016llx %016llx %016llx %016llx %016llx %016llx", + t->sw_if_index, t->next_index, t->action, t->match_acl_in_index, + t->match_rule_index, t->trace_bitmap, + t->packet_info[0], t->packet_info[1], t->packet_info[2], + t->packet_info[3], t->packet_info[4], t->packet_info[5]); + return s; +} + +/* *INDENT-OFF* */ +#define foreach_acl_fa_error \ +_(ACL_DROP, "ACL deny packets") \ +_(ACL_PERMIT, "ACL permit packets") \ +_(ACL_NEW_SESSION, "new sessions added") \ +_(ACL_EXIST_SESSION, "existing session packets") \ +_(ACL_CHECK, "checked packets") \ +_(ACL_RESTART_SESSION_TIMER, "restart session timer") \ +_(ACL_TOO_MANY_SESSIONS, "too many sessions to add new") \ +/* end of errors */ + +typedef enum +{ +#define _(sym,str) ACL_FA_ERROR_##sym, + foreach_acl_fa_error +#undef _ + ACL_FA_N_ERROR, +} acl_fa_error_t; + +static char *acl_fa_error_strings[] = { +#define _(sym,string) string, + foreach_acl_fa_error +#undef _ +}; +/* *INDENT-ON* */ + +static void * +get_ptr_to_offset (vlib_buffer_t * b0, int offset) +{ + u8 *p = vlib_buffer_get_current (b0) + offset; + return p; +} + + +static int +fa_acl_match_addr (ip46_address_t * addr1, ip46_address_t * addr2, + int prefixlen, int is_ip6) +{ + if (prefixlen == 0) + { + /* match any always succeeds */ + return 1; + } + if (is_ip6) + { + if (memcmp (addr1, addr2, prefixlen / 8)) + { + /* If the starting full bytes do not match, no point in bittwidling the thumbs further */ + return 0; + } + if (prefixlen % 8) + { + u8 b1 = *((u8 *) addr1 + 1 + prefixlen / 8); + u8 b2 = *((u8 *) addr2 + 1 + prefixlen / 8); + u8 mask0 = (0xff - ((1 << (8 - (prefixlen % 8))) - 1)); + return (b1 & mask0) == b2; + } + else + { + /* The prefix fits into integer number of bytes, so nothing left to do */ + return 1; + } + } + else + { + uint32_t a1 = ntohl (addr1->ip4.as_u32); + uint32_t a2 = ntohl (addr2->ip4.as_u32); + uint32_t mask0 = 0xffffffff - ((1 << (32 - prefixlen)) - 1); + return (a1 & mask0) == a2; + } +} + +static int +fa_acl_match_port (u16 port, u16 port_first, u16 port_last, int is_ip6) +{ + return ((port >= port_first) && (port <= port_last)); +} + +int +single_acl_match_5tuple (acl_main_t * am, u32 acl_index, 
fa_5tuple_t * pkt_5tuple, + int is_ip6, u8 * r_action, u32 * r_acl_match_p, + u32 * r_rule_match_p, u32 * trace_bitmap) +{ + int i; + acl_list_t *a; + acl_rule_t *r; + + if (pool_is_free_index (am->acls, acl_index)) + { + if (r_acl_match_p) + *r_acl_match_p = acl_index; + if (r_rule_match_p) + *r_rule_match_p = -1; + /* the ACL does not exist but is used for policy. Block traffic. */ + return 0; + } + a = am->acls + acl_index; + for (i = 0; i < a->count; i++) + { + r = a->rules + i; + if (is_ip6 != r->is_ipv6) + { + continue; + } + if (!fa_acl_match_addr + (&pkt_5tuple->addr[1], &r->dst, r->dst_prefixlen, is_ip6)) + continue; + +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning + ("ACL_FA_NODE_DBG acl %d rule %d pkt dst addr %U match rule addr %U/%d", + acl_index, i, format_ip46_address, &pkt_5tuple->addr[1], + IP46_TYPE_ANY, format_ip46_address, &r->dst, IP46_TYPE_ANY, + r->dst_prefixlen); +#endif + + if (!fa_acl_match_addr + (&pkt_5tuple->addr[0], &r->src, r->src_prefixlen, is_ip6)) + continue; + +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning + ("ACL_FA_NODE_DBG acl %d rule %d pkt src addr %U match rule addr %U/%d", + acl_index, i, format_ip46_address, &pkt_5tuple->addr[0], + IP46_TYPE_ANY, format_ip46_address, &r->src, IP46_TYPE_ANY, + r->src_prefixlen); + clib_warning + ("ACL_FA_NODE_DBG acl %d rule %d trying to match pkt proto %d with rule %d", + acl_index, i, pkt_5tuple->l4.proto, r->proto); +#endif + if (r->proto) + { + if (pkt_5tuple->l4.proto != r->proto) + continue; + + if (PREDICT_FALSE (pkt_5tuple->pkt.is_nonfirst_fragment && + am->l4_match_nonfirst_fragment)) + { + /* non-initial fragment with frag match configured - match this rule */ + *trace_bitmap |= 0x80000000; + *r_action = r->is_permit; + if (r_acl_match_p) + *r_acl_match_p = acl_index; + if (r_rule_match_p) + *r_rule_match_p = i; + return 1; + } + + /* A sanity check just to ensure we are about to match the ports extracted from the packet */ + if (PREDICT_FALSE (!pkt_5tuple->pkt.l4_valid)) + continue; + +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning + ("ACL_FA_NODE_DBG acl %d rule %d pkt proto %d match rule %d", + acl_index, i, pkt_5tuple->l4.proto, r->proto); +#endif + + if (!fa_acl_match_port + (pkt_5tuple->l4.port[0], r->src_port_or_type_first, + r->src_port_or_type_last, is_ip6)) + continue; + +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning + ("ACL_FA_NODE_DBG acl %d rule %d pkt sport %d match rule [%d..%d]", + acl_index, i, pkt_5tuple->l4.port[0], r->src_port_or_type_first, + r->src_port_or_type_last); +#endif + + if (!fa_acl_match_port + (pkt_5tuple->l4.port[1], r->dst_port_or_code_first, + r->dst_port_or_code_last, is_ip6)) + continue; + +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning + ("ACL_FA_NODE_DBG acl %d rule %d pkt dport %d match rule [%d..%d]", + acl_index, i, pkt_5tuple->l4.port[1], r->dst_port_or_code_first, + r->dst_port_or_code_last); +#endif + if (pkt_5tuple->pkt.tcp_flags_valid + && ((pkt_5tuple->pkt.tcp_flags & r->tcp_flags_mask) != + r->tcp_flags_value)) + continue; + } + /* everything matches! 
*/ +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning ("ACL_FA_NODE_DBG acl %d rule %d FULL-MATCH, action %d", + acl_index, i, r->is_permit); +#endif + *r_action = r->is_permit; + if (r_acl_match_p) + *r_acl_match_p = acl_index; + if (r_rule_match_p) + *r_rule_match_p = i; + return 1; + } + return 0; +} + +static u8 +linear_multi_acl_match_5tuple (u32 sw_if_index, fa_5tuple_t * pkt_5tuple, int is_l2, + int is_ip6, int is_input, u32 * acl_match_p, + u32 * rule_match_p, u32 * trace_bitmap) +{ + acl_main_t *am = &acl_main; + int i; + u32 *acl_vector; + u8 action = 0; + + if (is_input) + { + vec_validate (am->input_acl_vec_by_sw_if_index, sw_if_index); + acl_vector = am->input_acl_vec_by_sw_if_index[sw_if_index]; + } + else + { + vec_validate (am->output_acl_vec_by_sw_if_index, sw_if_index); + acl_vector = am->output_acl_vec_by_sw_if_index[sw_if_index]; + } + for (i = 0; i < vec_len (acl_vector); i++) + { +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning ("ACL_FA_NODE_DBG: Trying to match ACL: %d", + acl_vector[i]); +#endif + if (single_acl_match_5tuple + (am, acl_vector[i], pkt_5tuple, is_ip6, &action, + acl_match_p, rule_match_p, trace_bitmap)) + { + return action; + } + } + if (vec_len (acl_vector) > 0) + { + /* If there are ACLs and none matched, deny by default */ + return 0; + } +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning ("ACL_FA_NODE_DBG: No ACL on sw_if_index %d", sw_if_index); +#endif + /* Deny by default. If there are no ACLs defined we should not be here. */ + return 0; +} + +static u8 +multi_acl_match_5tuple (u32 sw_if_index, fa_5tuple_t * pkt_5tuple, int is_l2, + int is_ip6, int is_input, u32 * acl_match_p, + u32 * rule_match_p, u32 * trace_bitmap) +{ + acl_main_t *am = &acl_main; + if (am->use_hash_acl_matching) { + return hash_multi_acl_match_5tuple(sw_if_index, pkt_5tuple, is_l2, is_ip6, + is_input, acl_match_p, rule_match_p, trace_bitmap); + } else { + return linear_multi_acl_match_5tuple(sw_if_index, pkt_5tuple, is_l2, is_ip6, + is_input, acl_match_p, rule_match_p, trace_bitmap); + } +} + +static int +offset_within_packet (vlib_buffer_t * b0, int offset) +{ + /* For the purposes of this code, "within" means we have at least 8 bytes after it */ + return (offset <= (b0->current_length - 8)); +} + +static void +acl_fill_5tuple (acl_main_t * am, vlib_buffer_t * b0, int is_ip6, + int is_input, int is_l2_path, fa_5tuple_t * p5tuple_pkt) +{ + int l3_offset = ethernet_buffer_header_size(b0); + int l4_offset; + u16 ports[2]; + u16 proto; + /* IP4 and IP6 protocol numbers of ICMP */ + static u8 icmp_protos[] = { IP_PROTOCOL_ICMP, IP_PROTOCOL_ICMP6 }; + + if (is_input && !(is_l2_path)) + { + l3_offset = 0; + } + + /* key[0..3] contains src/dst address and is cleared/set below */ + /* Remainder of the key and per-packet non-key data */ + p5tuple_pkt->kv.key[4] = 0; + p5tuple_pkt->kv.value = 0; + + if (is_ip6) + { + clib_memcpy (&p5tuple_pkt->addr, + get_ptr_to_offset (b0, + offsetof (ip6_header_t, + src_address) + l3_offset), + sizeof (p5tuple_pkt->addr)); + proto = + *(u8 *) get_ptr_to_offset (b0, + offsetof (ip6_header_t, + protocol) + l3_offset); + l4_offset = l3_offset + sizeof (ip6_header_t); +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning ("ACL_FA_NODE_DBG: proto: %d, l4_offset: %d", proto, + l4_offset); +#endif + /* IP6 EH handling is here, increment l4_offset if needs to, update the proto */ + int need_skip_eh = clib_bitmap_get (am->fa_ipv6_known_eh_bitmap, proto); + if (PREDICT_FALSE (need_skip_eh)) + { + while (need_skip_eh && offset_within_packet (b0, l4_offset)) + { + /* Fragment header 
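+ (IPv6 protocol 44) is a fixed eight bytes and, unlike the other
+ extension headers walked below,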
needs special handling */ + if (PREDICT_FALSE(ACL_EH_FRAGMENT == proto)) + { + proto = *(u8 *) get_ptr_to_offset (b0, l4_offset); + u16 frag_offset; + clib_memcpy (&frag_offset, get_ptr_to_offset (b0, 2 + l4_offset), sizeof(frag_offset)); + frag_offset = ntohs(frag_offset) >> 3; + if (frag_offset) + { + p5tuple_pkt->pkt.is_nonfirst_fragment = 1; + /* invalidate L4 offset so we don't try to find L4 info */ + l4_offset += b0->current_length; + } + else + { + /* First fragment: skip the frag header and move on. */ + l4_offset += 8; + } + } + else + { + u8 nwords = *(u8 *) get_ptr_to_offset (b0, 1 + l4_offset); + proto = *(u8 *) get_ptr_to_offset (b0, l4_offset); + l4_offset += 8 * (1 + (u16) nwords); + } +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning ("ACL_FA_NODE_DBG: new proto: %d, new offset: %d", + proto, l4_offset); +#endif + need_skip_eh = + clib_bitmap_get (am->fa_ipv6_known_eh_bitmap, proto); + } + } + } + else + { + p5tuple_pkt->kv.key[0] = 0; + p5tuple_pkt->kv.key[1] = 0; + p5tuple_pkt->kv.key[2] = 0; + p5tuple_pkt->kv.key[3] = 0; + clib_memcpy (&p5tuple_pkt->addr[0].ip4, + get_ptr_to_offset (b0, + offsetof (ip4_header_t, + src_address) + l3_offset), + sizeof (p5tuple_pkt->addr[0].ip4)); + clib_memcpy (&p5tuple_pkt->addr[1].ip4, + get_ptr_to_offset (b0, + offsetof (ip4_header_t, + dst_address) + l3_offset), + sizeof (p5tuple_pkt->addr[1].ip4)); + proto = + *(u8 *) get_ptr_to_offset (b0, + offsetof (ip4_header_t, + protocol) + l3_offset); + l4_offset = l3_offset + sizeof (ip4_header_t); + u16 flags_and_fragment_offset; + clib_memcpy (&flags_and_fragment_offset, + get_ptr_to_offset (b0, + offsetof (ip4_header_t, + flags_and_fragment_offset)) + l3_offset, + sizeof(flags_and_fragment_offset)); + flags_and_fragment_offset = ntohs (flags_and_fragment_offset); + + /* non-initial fragments have non-zero offset */ + if ((PREDICT_FALSE(0xfff & flags_and_fragment_offset))) + { + p5tuple_pkt->pkt.is_nonfirst_fragment = 1; + /* invalidate L4 offset so we don't try to find L4 info */ + l4_offset += b0->current_length; + } + + } + p5tuple_pkt->l4.proto = proto; + if (PREDICT_TRUE (offset_within_packet (b0, l4_offset))) + { + p5tuple_pkt->pkt.l4_valid = 1; + if (icmp_protos[is_ip6] == proto) + { + /* type */ + p5tuple_pkt->l4.port[0] = + *(u8 *) get_ptr_to_offset (b0, + l4_offset + offsetof (icmp46_header_t, + type)); + /* code */ + p5tuple_pkt->l4.port[1] = + *(u8 *) get_ptr_to_offset (b0, + l4_offset + offsetof (icmp46_header_t, + code)); + } + else if ((IPPROTO_TCP == proto) || (IPPROTO_UDP == proto)) + { + clib_memcpy (&ports, + get_ptr_to_offset (b0, + l4_offset + offsetof (tcp_header_t, + src_port)), + sizeof (ports)); + p5tuple_pkt->l4.port[0] = ntohs (ports[0]); + p5tuple_pkt->l4.port[1] = ntohs (ports[1]); + + p5tuple_pkt->pkt.tcp_flags = + *(u8 *) get_ptr_to_offset (b0, + l4_offset + offsetof (tcp_header_t, + flags)); + p5tuple_pkt->pkt.tcp_flags_valid = (proto == IPPROTO_TCP); + } + /* + * FIXME: rather than the above conditional, here could + * be a nice generic mechanism to extract two L4 values: + * + * have a per-protocol array of 4 elements like this: + * u8 offset; to take the byte from, off L4 header + * u8 mask; to mask it with, before storing + * + * this way we can describe UDP, TCP and ICMP[46] semantics, + * and add a sort of FPM-type behavior for other protocols. + * + * Of course, is it faster ? and is it needed ? 
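+ * A hedged sketch of that idea (invented names, not in the tree),
+ * keeping the offset+mask formulation suggested above:
+ *
+ *   typedef struct { u8 offset; u8 mask; } l4_byte_rule_t;
+ *   // e.g. ICMP: type at byte 0, code at byte 1, both taken whole
+ *   static const l4_byte_rule_t icmp_rules[2] = { {0, 0xff}, {1, 0xff} };
+ *   // value[i] = bytes[l4_offset + icmp_rules[i].offset] & icmp_rules[i].mask;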
+ *
+ */
+ }
+}
+
+
+/* Session keys match the packets received, and mirror the packets sent */
+static void
+acl_make_5tuple_session_key (int is_input, fa_5tuple_t * p5tuple_pkt,
+ fa_5tuple_t * p5tuple_sess)
+{
+ int src_index = is_input ? 0 : 1;
+ int dst_index = is_input ? 1 : 0;
+ p5tuple_sess->addr[src_index] = p5tuple_pkt->addr[0];
+ p5tuple_sess->addr[dst_index] = p5tuple_pkt->addr[1];
+ p5tuple_sess->l4.as_u64 = p5tuple_pkt->l4.as_u64;
+ p5tuple_sess->l4.port[src_index] = p5tuple_pkt->l4.port[0];
+ p5tuple_sess->l4.port[dst_index] = p5tuple_pkt->l4.port[1];
+}
+
+
+static int
+acl_fa_ifc_has_sessions (acl_main_t * am, int sw_if_index0)
+{
+ return am->fa_sessions_hash_is_initialized;
+}
+
+static int
+acl_fa_ifc_has_in_acl (acl_main_t * am, int sw_if_index0)
+{
+ int it_has = clib_bitmap_get (am->fa_in_acl_on_sw_if_index, sw_if_index0);
+ return it_has;
+}
+
+static int
+acl_fa_ifc_has_out_acl (acl_main_t * am, int sw_if_index0)
+{
+ int it_has = clib_bitmap_get (am->fa_out_acl_on_sw_if_index, sw_if_index0);
+ return it_has;
+}
+
+
+static int
+fa_session_get_timeout_type (acl_main_t * am, fa_session_t * sess)
+{
+ /* seeing both SYNs and ACKs but no FINs means we are in the established state */
+ u16 masked_flags =
+ sess->tcp_flags_seen.as_u16 & ((TCP_FLAGS_RSTFINACKSYN << 8) +
+ TCP_FLAGS_RSTFINACKSYN);
+ switch (sess->info.l4.proto)
+ {
+ case IPPROTO_TCP:
+ if (((TCP_FLAGS_ACKSYN << 8) + TCP_FLAGS_ACKSYN) == masked_flags)
+ {
+ return ACL_TIMEOUT_TCP_IDLE;
+ }
+ else
+ {
+ return ACL_TIMEOUT_TCP_TRANSIENT;
+ }
+ break;
+ case IPPROTO_UDP:
+ return ACL_TIMEOUT_UDP_IDLE;
+ break;
+ default:
+ return ACL_TIMEOUT_UDP_IDLE;
+ }
+}
+
+
+static u64
+fa_session_get_shortest_timeout(acl_main_t * am)
+{
+ int timeout_type;
+ u64 timeout = ~0LL;
+ for(timeout_type = 0; timeout_type < ACL_N_TIMEOUTS; timeout_type++) {
+ if (timeout > am->session_timeout_sec[timeout_type]) {
+ timeout = am->session_timeout_sec[timeout_type];
+ }
+ }
+ return timeout;
+}
+
+/*
+ * Get the timeout of the session in a list since its enqueue time.
+ */
+
+static u64
+fa_session_get_list_timeout (acl_main_t * am, fa_session_t * sess)
+{
+ u64 timeout = am->vlib_main->clib_time.clocks_per_second;
+ /*
+ * we have the shortest possible timeout type in all the lists
+ * (see README-multicore for the rationale)
+ */
+ timeout *= fa_session_get_shortest_timeout(am);
+ return timeout;
+}
+
+/*
+ * Get the idle timeout of a session.
+ */
+
+static u64
+fa_session_get_timeout (acl_main_t * am, fa_session_t * sess)
+{
+ u64 timeout = am->vlib_main->clib_time.clocks_per_second;
+ int timeout_type = fa_session_get_timeout_type (am, sess);
+ timeout *= am->session_timeout_sec[timeout_type];
+ return timeout;
+}
+
+static void
+acl_fa_verify_init_sessions (acl_main_t * am)
+{
+ if (!am->fa_sessions_hash_is_initialized) {
+ u16 wk;
+ /* Allocate the per-worker sessions pools */
+ for (wk = 0; wk < vec_len (am->per_worker_data); wk++) {
+ acl_fa_per_worker_data_t *pw = &am->per_worker_data[wk];
+ pool_alloc_aligned(pw->fa_sessions_pool, am->fa_conn_table_max_entries, CLIB_CACHE_LINE_BYTES);
+ }
+
+ /* ... 
and the interface session hash table */
+ BV (clib_bihash_init) (&am->fa_sessions_hash,
+ "ACL plugin FA session bihash",
+ am->fa_conn_table_hash_num_buckets,
+ am->fa_conn_table_hash_memory_size);
+ am->fa_sessions_hash_is_initialized = 1;
+ }
+}
+
+static inline fa_session_t *get_session_ptr(acl_main_t *am, u16 thread_index, u32 session_index)
+{
+ acl_fa_per_worker_data_t *pw = &am->per_worker_data[thread_index];
+ fa_session_t *sess = pool_is_free_index (pw->fa_sessions_pool, session_index) ? 0 : pool_elt_at_index(pw->fa_sessions_pool, session_index);
+ return sess;
+}
+
+static inline int is_valid_session_ptr(acl_main_t *am, u16 thread_index, fa_session_t *sess)
+{
+ acl_fa_per_worker_data_t *pw = &am->per_worker_data[thread_index];
+ return ((sess != 0) && ((sess - pw->fa_sessions_pool) < pool_len(pw->fa_sessions_pool)));
+}
+
+static void
+acl_fa_conn_list_add_session (acl_main_t * am, fa_full_session_id_t sess_id, u64 now)
+{
+ fa_session_t *sess = get_session_ptr(am, sess_id.thread_index, sess_id.session_index);
+ u8 list_id = fa_session_get_timeout_type(am, sess);
+ uword thread_index = os_get_thread_index ();
+ acl_fa_per_worker_data_t *pw = &am->per_worker_data[thread_index];
+ /* the retrieved session thread index must necessarily be the same as the one in the key */
+ ASSERT (sess->thread_index == sess_id.thread_index);
+ /* the retrieved session thread index must be the same as the current thread */
+ ASSERT (sess->thread_index == thread_index);
+ sess->link_enqueue_time = now;
+ sess->link_list_id = list_id;
+ sess->link_next_idx = ~0;
+ sess->link_prev_idx = pw->fa_conn_list_tail[list_id];
+ if (~0 != pw->fa_conn_list_tail[list_id]) {
+ fa_session_t *prev_sess = get_session_ptr(am, thread_index, pw->fa_conn_list_tail[list_id]);
+ prev_sess->link_next_idx = sess_id.session_index;
+ /* We should never try to link with a session on another thread */
+ ASSERT(prev_sess->thread_index == sess->thread_index);
+ }
+ pw->fa_conn_list_tail[list_id] = sess_id.session_index;
+ pw->serviced_sw_if_index_bitmap = clib_bitmap_set(pw->serviced_sw_if_index_bitmap, sess->sw_if_index, 1);
+
+ if (~0 == pw->fa_conn_list_head[list_id]) {
+ pw->fa_conn_list_head[list_id] = sess_id.session_index;
+ }
+}
+
+static int
+acl_fa_conn_list_delete_session (acl_main_t *am, fa_full_session_id_t sess_id)
+{
+ uword thread_index = os_get_thread_index ();
+ acl_fa_per_worker_data_t *pw = &am->per_worker_data[thread_index];
+ if (thread_index != sess_id.thread_index) {
+ /* If another thread attempts to delete the session, fail it. 
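+ The per-timeout-type lists are doubly-linked through pool indices
+ rather than pointers, with ~0 as the null index; the unlink below is
+ the classic one, just spelled with fa_session_t field names. A generic
+ sketch of the same operation:
+
+ if (s->prev != ~0) elts[s->prev].next = s->next;
+ if (s->next != ~0) elts[s->next].prev = s->prev;
+ if (head == i) head = s->next;
+ if (tail == i) tail = s->prev;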
*/
+#ifdef FA_NODE_VERBOSE_DEBUG
+ clib_warning("thread id in key %d != curr thread index, not deleting", sess_id.thread_index);
+#endif
+ return 0;
+ }
+ fa_session_t *sess = get_session_ptr(am, sess_id.thread_index, sess_id.session_index);
+ /* we should never try to delete the session with another thread index */
+ ASSERT(sess->thread_index == thread_index);
+ if (~0 != sess->link_prev_idx) {
+ fa_session_t *prev_sess = get_session_ptr(am, thread_index, sess->link_prev_idx);
+ /* the previous session must be in the same list as this one */
+ ASSERT(prev_sess->link_list_id == sess->link_list_id);
+ prev_sess->link_next_idx = sess->link_next_idx;
+ }
+ if (~0 != sess->link_next_idx) {
+ fa_session_t *next_sess = get_session_ptr(am, thread_index, sess->link_next_idx);
+ /* The next session must be in the same list as the one we are deleting */
+ ASSERT(next_sess->link_list_id == sess->link_list_id);
+ next_sess->link_prev_idx = sess->link_prev_idx;
+ }
+ if (pw->fa_conn_list_head[sess->link_list_id] == sess_id.session_index) {
+ pw->fa_conn_list_head[sess->link_list_id] = sess->link_next_idx;
+ }
+ if (pw->fa_conn_list_tail[sess->link_list_id] == sess_id.session_index) {
+ pw->fa_conn_list_tail[sess->link_list_id] = sess->link_prev_idx;
+ }
+ return 1;
+}
+
+static int
+acl_fa_restart_timer_for_session (acl_main_t * am, u64 now, fa_full_session_id_t sess_id)
+{
+ if (acl_fa_conn_list_delete_session(am, sess_id)) {
+ acl_fa_conn_list_add_session(am, sess_id, now);
+ return 1;
+ } else {
+ /*
+ * Our thread does not own this connection, so we cannot delete
+ * the session. To avoid the complicated signaling, we simply
+ * pick the list waiting time to be the shortest of the timeouts.
+ * This way we do not have to do anything special, and let
+ * the regular requeue check take care of everything.
+ */
+ return 0;
+ }
+}
+
+
+static u8
+acl_fa_track_session (acl_main_t * am, int is_input, u32 sw_if_index, u64 now,
+ fa_session_t * sess, fa_5tuple_t * pkt_5tuple)
+{
+ sess->last_active_time = now;
+ if (pkt_5tuple->pkt.tcp_flags_valid)
+ {
+ sess->tcp_flags_seen.as_u8[is_input] |= pkt_5tuple->pkt.tcp_flags;
+ }
+ return 3;
+}
+
+
+static void
+acl_fa_delete_session (acl_main_t * am, u32 sw_if_index, fa_full_session_id_t sess_id)
+{
+ void *oldheap = clib_mem_set_heap(am->acl_mheap);
+ fa_session_t *sess = get_session_ptr(am, sess_id.thread_index, sess_id.session_index);
+ ASSERT(sess->thread_index == os_get_thread_index ());
+ BV (clib_bihash_add_del) (&am->fa_sessions_hash,
+ &sess->info.kv, 0);
+ acl_fa_per_worker_data_t *pw = &am->per_worker_data[sess_id.thread_index];
+ pool_put_index (pw->fa_sessions_pool, sess_id.session_index);
+ /* Deleting from timer structures not needed,
+ as the caller must have dealt with the timers. 
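+ The bihash value removed here is a fa_full_session_id_t, so a lookup
+ result maps straight back to a (thread, pool index) pair with no extra
+ table; the round-trip used by the add/find code below looks like:
+
+ fa_full_session_id_t id;
+ id.thread_index = thread_index;
+ id.session_index = sess - pw->fa_sessions_pool;
+ kv.value = id.as_u64; // stored on add
+ id.as_u64 = result.value; // recovered on lookup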
*/
+ vec_validate (pw->fa_session_dels_by_sw_if_index, sw_if_index);
+ clib_mem_set_heap (oldheap);
+ pw->fa_session_dels_by_sw_if_index[sw_if_index]++;
+ clib_smp_atomic_add(&am->fa_session_total_dels, 1);
+}
+
+static int
+acl_fa_can_add_session (acl_main_t * am, int is_input, u32 sw_if_index)
+{
+ u64 curr_sess_count;
+ curr_sess_count = am->fa_session_total_adds - am->fa_session_total_dels;
+ return (curr_sess_count < am->fa_conn_table_max_entries);
+}
+
+static u64
+acl_fa_get_list_head_expiry_time(acl_main_t *am, acl_fa_per_worker_data_t *pw, u64 now, u16 thread_index, int timeout_type)
+{
+ fa_session_t *sess = get_session_ptr(am, thread_index, pw->fa_conn_list_head[timeout_type]);
+ /*
+ * We cannot check just the index here, because the worker thread might
+ * dequeue the connection from the head in the meantime, just as we are
+ * about to check it.
+ */
+ if (!is_valid_session_ptr(am, thread_index, sess)) {
+ return ~0LL; // infinity.
+ } else {
+ u64 timeout_time =
+ sess->link_enqueue_time + fa_session_get_list_timeout (am, sess);
+ return timeout_time;
+ }
+}
+
+static int
+acl_fa_conn_time_to_check (acl_main_t *am, acl_fa_per_worker_data_t *pw, u64 now, u16 thread_index, u32 session_index)
+{
+ fa_session_t *sess = get_session_ptr(am, thread_index, session_index);
+ u64 timeout_time =
+ sess->link_enqueue_time + fa_session_get_list_timeout (am, sess);
+ return (timeout_time < now) || (sess->link_enqueue_time <= pw->swipe_end_time);
+}
+
+/*
+ * See if there are sessions ready to be checked,
+ * do the maintenance (requeue or delete), and
+ * return the total number of sessions reclaimed.
+ */
+static int
+acl_fa_check_idle_sessions(acl_main_t *am, u16 thread_index, u64 now)
+{
+ acl_fa_per_worker_data_t *pw = &am->per_worker_data[thread_index];
+ fa_full_session_id_t fsid;
+ fsid.thread_index = thread_index;
+ int total_expired = 0;
+
+ {
+ u8 tt = 0;
+ for(tt = 0; tt < ACL_N_TIMEOUTS; tt++) {
+ while((vec_len(pw->expired) < am->fa_max_deleted_sessions_per_interval)
+ && (~0 != pw->fa_conn_list_head[tt])
+ && (acl_fa_conn_time_to_check(am, pw, now, thread_index,
+ pw->fa_conn_list_head[tt]))) {
+ fsid.session_index = pw->fa_conn_list_head[tt];
+ vec_add1(pw->expired, fsid.session_index);
+ acl_fa_conn_list_delete_session(am, fsid);
+ }
+ }
+ }
+
+ u32 *psid = NULL;
+ vec_foreach (psid, pw->expired)
+ {
+ fsid.session_index = *psid;
+ if (!pool_is_free_index (pw->fa_sessions_pool, fsid.session_index))
+ {
+ fa_session_t *sess = get_session_ptr(am, thread_index, fsid.session_index);
+ u32 sw_if_index = sess->sw_if_index;
+ u64 sess_timeout_time =
+ sess->last_active_time + fa_session_get_timeout (am, sess);
+ if ((now < sess_timeout_time) && (0 == clib_bitmap_get(pw->pending_clear_sw_if_index_bitmap, sw_if_index)))
+ {
+#ifdef FA_NODE_VERBOSE_DEBUG
+ clib_warning ("ACL_FA_NODE_CLEAN: Restarting timer for session %d",
+ (int) fsid.session_index);
+#endif
+ /* There was activity on the session, so the idle timeout
+ has not passed. Enqueue for another time period. 
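+ Numerically, the deadline is last_active_time plus the per-type
+ timeout scaled to CPU ticks; with (illustrative numbers only) a 2 GHz
+ clock and a 600 s idle timeout:
+
+ sess_timeout_time = last_active_time + 600 * 2e9 ticks
+
+ so any packet seen inside that window requeues the session instead.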
*/
+
+ acl_fa_conn_list_add_session(am, fsid, now);
+ pw->cnt_session_timer_restarted++;
+ }
+ else
+ {
+#ifdef FA_NODE_VERBOSE_DEBUG
+ clib_warning ("ACL_FA_NODE_CLEAN: Deleting session %d",
+ (int) fsid.session_index);
+#endif
+ acl_fa_delete_session (am, sw_if_index, fsid);
+ pw->cnt_deleted_sessions++;
+ }
+ }
+ else
+ {
+ pw->cnt_already_deleted_sessions++;
+ }
+ }
+ total_expired = vec_len(pw->expired);
+ /* zero out the vector which we have acted on */
+ if (pw->expired)
+ _vec_len (pw->expired) = 0;
+ /* if we were advancing and reached the end
+ * (no more sessions to recycle), reset the fast-forward timestamp */
+
+ if (pw->swipe_end_time && 0 == total_expired)
+ pw->swipe_end_time = 0;
+ return (total_expired);
+}
+
+always_inline void
+acl_fa_try_recycle_session (acl_main_t * am, int is_input, u16 thread_index, u32 sw_if_index)
+{
+ /* try to recycle a TCP transient session */
+ acl_fa_per_worker_data_t *pw = &am->per_worker_data[thread_index];
+ u8 timeout_type = ACL_TIMEOUT_TCP_TRANSIENT;
+ fa_full_session_id_t sess_id;
+ sess_id.session_index = pw->fa_conn_list_head[timeout_type];
+ if (~0 != sess_id.session_index) {
+ sess_id.thread_index = thread_index;
+ acl_fa_conn_list_delete_session(am, sess_id);
+ acl_fa_delete_session(am, sw_if_index, sess_id);
+ }
+}
+
+static fa_session_t *
+acl_fa_add_session (acl_main_t * am, int is_input, u32 sw_if_index, u64 now,
+ fa_5tuple_t * p5tuple)
+{
+ clib_bihash_kv_40_8_t *pkv = &p5tuple->kv;
+ clib_bihash_kv_40_8_t kv;
+ fa_full_session_id_t f_sess_id;
+ uword thread_index = os_get_thread_index();
+ void *oldheap = clib_mem_set_heap(am->acl_mheap);
+ acl_fa_per_worker_data_t *pw = &am->per_worker_data[thread_index];
+
+ f_sess_id.thread_index = thread_index;
+ fa_session_t *sess;
+
+ pool_get_aligned (pw->fa_sessions_pool, sess, CLIB_CACHE_LINE_BYTES);
+ f_sess_id.session_index = sess - pw->fa_sessions_pool;
+
+ kv.key[0] = pkv->key[0];
+ kv.key[1] = pkv->key[1];
+ kv.key[2] = pkv->key[2];
+ kv.key[3] = pkv->key[3];
+ kv.key[4] = pkv->key[4];
+ kv.value = f_sess_id.as_u64;
+
+ memcpy (sess, pkv, sizeof (pkv->key));
+ sess->last_active_time = now;
+ sess->sw_if_index = sw_if_index;
+ sess->tcp_flags_seen.as_u16 = 0;
+ sess->thread_index = thread_index;
+ sess->link_list_id = ~0;
+ sess->link_prev_idx = ~0;
+ sess->link_next_idx = ~0;
+
+
+
+ ASSERT(am->fa_sessions_hash_is_initialized == 1);
+ BV (clib_bihash_add_del) (&am->fa_sessions_hash,
+ &kv, 1);
+ acl_fa_conn_list_add_session(am, f_sess_id, now);
+
+ vec_validate (pw->fa_session_adds_by_sw_if_index, sw_if_index);
+ clib_mem_set_heap (oldheap);
+ pw->fa_session_adds_by_sw_if_index[sw_if_index]++;
+ clib_smp_atomic_add(&am->fa_session_total_adds, 1);
+ return sess;
+}
+
+static int
+acl_fa_find_session (acl_main_t * am, u32 sw_if_index0, fa_5tuple_t * p5tuple,
+ clib_bihash_kv_40_8_t * pvalue_sess)
+{
+ return (BV (clib_bihash_search)
+ (&am->fa_sessions_hash, &p5tuple->kv,
+ pvalue_sess) == 0);
+}
+
+
+always_inline uword
+acl_fa_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node, vlib_frame_t * frame, int is_ip6,
+ int is_input, int is_l2_path, u32 * l2_feat_next_node_index,
+ vlib_node_registration_t * acl_fa_node)
+{
+ u32 n_left_from, *from, *to_next;
+ acl_fa_next_t next_index;
+ u32 pkts_acl_checked = 0;
+ u32 pkts_new_session = 0;
+ u32 pkts_exist_session = 0;
+ u32 pkts_acl_permit = 0;
+ u32 pkts_restart_session_timer = 0;
+ u32 trace_bitmap = 0;
+ acl_main_t *am = &acl_main;
+ fa_5tuple_t fa_5tuple, kv_sess;
+ clib_bihash_kv_40_8_t value_sess;
+ vlib_node_runtime_t 
*error_node; + u64 now = clib_cpu_time_now (); + uword thread_index = os_get_thread_index (); + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + error_node = vlib_node_get_runtime (vm, acl_fa_node->index); + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + u32 next0 = 0; + u8 action = 0; + u32 sw_if_index0; + int acl_check_needed = 1; + u32 match_acl_in_index = ~0; + u32 match_rule_index = ~0; + u8 error0 = 0; + + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + if (is_input) + sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX]; + else + sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_TX]; + + /* + * Extract the L3/L4 matching info into a 5-tuple structure, + * then create a session key whose layout is independent on forward or reverse + * direction of the packet. + */ + + acl_fill_5tuple (am, b0, is_ip6, is_input, is_l2_path, &fa_5tuple); + fa_5tuple.l4.lsb_of_sw_if_index = sw_if_index0 & 0xffff; + acl_make_5tuple_session_key (is_input, &fa_5tuple, &kv_sess); + fa_5tuple.pkt.sw_if_index = sw_if_index0; + fa_5tuple.pkt.is_ip6 = is_ip6; + fa_5tuple.pkt.is_input = is_input; + fa_5tuple.pkt.mask_type_index_lsb = ~0; +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning + ("ACL_FA_NODE_DBG: session 5-tuple %016llx %016llx %016llx %016llx %016llx : %016llx", + kv_sess.kv.key[0], kv_sess.kv.key[1], kv_sess.kv.key[2], + kv_sess.kv.key[3], kv_sess.kv.key[4], kv_sess.kv.value); + clib_warning + ("ACL_FA_NODE_DBG: packet 5-tuple %016llx %016llx %016llx %016llx %016llx : %016llx", + fa_5tuple.kv.key[0], fa_5tuple.kv.key[1], fa_5tuple.kv.key[2], + fa_5tuple.kv.key[3], fa_5tuple.kv.key[4], fa_5tuple.kv.value); +#endif + + /* Try to match an existing session first */ + + if (acl_fa_ifc_has_sessions (am, sw_if_index0)) + { + if (acl_fa_find_session + (am, sw_if_index0, &kv_sess, &value_sess)) + { + trace_bitmap |= 0x80000000; + error0 = ACL_FA_ERROR_ACL_EXIST_SESSION; + fa_full_session_id_t f_sess_id; + + f_sess_id.as_u64 = value_sess.value; + ASSERT(f_sess_id.thread_index < vec_len(vlib_mains)); + + fa_session_t *sess = get_session_ptr(am, f_sess_id.thread_index, f_sess_id.session_index); + int old_timeout_type = + fa_session_get_timeout_type (am, sess); + action = + acl_fa_track_session (am, is_input, sw_if_index0, now, + sess, &fa_5tuple); + /* expose the session id to the tracer */ + match_rule_index = f_sess_id.session_index; + int new_timeout_type = + fa_session_get_timeout_type (am, sess); + acl_check_needed = 0; + pkts_exist_session += 1; + /* Tracking might have changed the session timeout type, e.g. from transient to established */ + if (PREDICT_FALSE (old_timeout_type != new_timeout_type)) + { + acl_fa_restart_timer_for_session (am, now, f_sess_id); + pkts_restart_session_timer++; + trace_bitmap |= + 0x00010000 + ((0xff & old_timeout_type) << 8) + + (0xff & new_timeout_type); + } + /* + * I estimate the likelihood to be very low - the VPP needs + * to have >64K interfaces to start with and then on + * exactly 64K indices apart needs to be exactly the same + * 5-tuple... Anyway, since this probability is nonzero - + * print an error and drop the unlucky packet. 
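+ * (Only the low 16 bits fit because fa_session_l4_key_t packs port[2],
+ * proto and lsb_of_sw_if_index into a single u64 of the 40-byte bihash
+ * key:
+ *
+ *   fa_5tuple.l4.lsb_of_sw_if_index = sw_if_index0 & 0xffff;
+ *
+ * hence this check against the full sw_if_index kept in the session.)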
+ * If this shows up in real world, we would need to bump + * the hash key length. + */ + if (PREDICT_FALSE(sess->sw_if_index != sw_if_index0)) { + clib_warning("BUG: session LSB16(sw_if_index) and 5-tuple collision!"); + acl_check_needed = 0; + action = 0; + } + } + } + + if (acl_check_needed) + { + action = + multi_acl_match_5tuple (sw_if_index0, &fa_5tuple, is_l2_path, + is_ip6, is_input, &match_acl_in_index, + &match_rule_index, &trace_bitmap); + error0 = action; + if (1 == action) + pkts_acl_permit += 1; + if (2 == action) + { + if (!acl_fa_can_add_session (am, is_input, sw_if_index0)) + acl_fa_try_recycle_session (am, is_input, thread_index, sw_if_index0); + + if (acl_fa_can_add_session (am, is_input, sw_if_index0)) + { + fa_session_t *sess = acl_fa_add_session (am, is_input, sw_if_index0, now, + &kv_sess); + acl_fa_track_session (am, is_input, sw_if_index0, now, + sess, &fa_5tuple); + pkts_new_session += 1; + } + else + { + action = 0; + error0 = ACL_FA_ERROR_ACL_TOO_MANY_SESSIONS; + } + } + } + + + + if (action > 0) + { + if (is_l2_path) + next0 = vnet_l2_feature_next (b0, l2_feat_next_node_index, 0); + else + vnet_feature_next (sw_if_index0, &next0, b0); + } + + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + acl_fa_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t)); + t->sw_if_index = sw_if_index0; + t->next_index = next0; + t->match_acl_in_index = match_acl_in_index; + t->match_rule_index = match_rule_index; + t->packet_info[0] = fa_5tuple.kv.key[0]; + t->packet_info[1] = fa_5tuple.kv.key[1]; + t->packet_info[2] = fa_5tuple.kv.key[2]; + t->packet_info[3] = fa_5tuple.kv.key[3]; + t->packet_info[4] = fa_5tuple.kv.key[4]; + t->packet_info[5] = fa_5tuple.kv.value; + t->action = action; + t->trace_bitmap = trace_bitmap; + } + + next0 = next0 < node->n_next_nodes ? 
next0 : 0; + if (0 == next0) + b0->error = error_node->errors[error0]; + + pkts_acl_checked += 1; + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, bi0, + next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, acl_fa_node->index, + ACL_FA_ERROR_ACL_CHECK, pkts_acl_checked); + vlib_node_increment_counter (vm, acl_fa_node->index, + ACL_FA_ERROR_ACL_PERMIT, pkts_acl_permit); + vlib_node_increment_counter (vm, acl_fa_node->index, + ACL_FA_ERROR_ACL_NEW_SESSION, + pkts_new_session); + vlib_node_increment_counter (vm, acl_fa_node->index, + ACL_FA_ERROR_ACL_EXIST_SESSION, + pkts_exist_session); + vlib_node_increment_counter (vm, acl_fa_node->index, + ACL_FA_ERROR_ACL_RESTART_SESSION_TIMER, + pkts_restart_session_timer); + return frame->n_vectors; +} + + +vlib_node_registration_t acl_in_l2_ip6_node; +static uword +acl_in_ip6_l2_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + acl_main_t *am = &acl_main; + return acl_fa_node_fn (vm, node, frame, 1, 1, 1, + am->fa_acl_in_ip6_l2_node_feat_next_node_index, + &acl_in_l2_ip6_node); +} + +vlib_node_registration_t acl_in_l2_ip4_node; +static uword +acl_in_ip4_l2_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + acl_main_t *am = &acl_main; + return acl_fa_node_fn (vm, node, frame, 0, 1, 1, + am->fa_acl_in_ip4_l2_node_feat_next_node_index, + &acl_in_l2_ip4_node); +} + +vlib_node_registration_t acl_out_l2_ip6_node; +static uword +acl_out_ip6_l2_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + acl_main_t *am = &acl_main; + return acl_fa_node_fn (vm, node, frame, 1, 0, 1, + am->fa_acl_out_ip6_l2_node_feat_next_node_index, + &acl_out_l2_ip6_node); +} + +vlib_node_registration_t acl_out_l2_ip4_node; +static uword +acl_out_ip4_l2_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + acl_main_t *am = &acl_main; + return acl_fa_node_fn (vm, node, frame, 0, 0, 1, + am->fa_acl_out_ip4_l2_node_feat_next_node_index, + &acl_out_l2_ip4_node); +} + + +/**** L3 processing path nodes ****/ + + +vlib_node_registration_t acl_in_fa_ip6_node; +static uword +acl_in_ip6_fa_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + return acl_fa_node_fn (vm, node, frame, 1, 1, 0, 0, &acl_in_fa_ip6_node); +} + +vlib_node_registration_t acl_in_fa_ip4_node; +static uword +acl_in_ip4_fa_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + return acl_fa_node_fn (vm, node, frame, 0, 1, 0, 0, &acl_in_fa_ip4_node); +} + +vlib_node_registration_t acl_out_fa_ip6_node; +static uword +acl_out_ip6_fa_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + return acl_fa_node_fn (vm, node, frame, 1, 0, 0, 0, &acl_out_fa_ip6_node); +} + +vlib_node_registration_t acl_out_fa_ip4_node; +static uword +acl_out_ip4_fa_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + return acl_fa_node_fn (vm, node, frame, 0, 0, 0, 0, &acl_out_fa_ip4_node); +} + +/* + * This process ensures the connection cleanup happens every so often + * even in absence of traffic, as well as provides general orchestration + * for requests like connection deletion on a given sw_if_index. 
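+ * (Control-flow sketch: management code signals this process with
+ * vlib_process_signal_event (vm, am->fa_cleaner_node_index, event, data);
+ * the process translates the request into per-worker interrupts and
+ * waits for the workers to acknowledge via the generation counter.)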
+ */ + + +/* *INDENT-OFF* */ +#define foreach_acl_fa_cleaner_error \ +_(UNKNOWN_EVENT, "unknown event received") \ +/* end of errors */ + +typedef enum +{ +#define _(sym,str) ACL_FA_CLEANER_ERROR_##sym, + foreach_acl_fa_cleaner_error +#undef _ + ACL_FA_CLEANER_N_ERROR, +} acl_fa_cleaner_error_t; + +static char *acl_fa_cleaner_error_strings[] = { +#define _(sym,string) string, + foreach_acl_fa_cleaner_error +#undef _ +}; + +/* *INDENT-ON* */ + +static vlib_node_registration_t acl_fa_session_cleaner_process_node; +static vlib_node_registration_t acl_fa_worker_session_cleaner_process_node; + +/* + * Per-worker thread interrupt-driven cleaner thread + * to clean idle connections if there are no packets + */ +static uword +acl_fa_worker_conn_cleaner_process(vlib_main_t * vm, + vlib_node_runtime_t * rt, vlib_frame_t * f) +{ + acl_main_t *am = &acl_main; + u64 now = clib_cpu_time_now (); + u16 thread_index = os_get_thread_index (); + acl_fa_per_worker_data_t *pw = &am->per_worker_data[thread_index]; + int num_expired; +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning("\nacl_fa_worker_conn_cleaner: thread index %d now %lu\n\n", thread_index, now); +#endif + /* allow another interrupt to be queued */ + pw->interrupt_is_pending = 0; + if (pw->clear_in_process) { + if (0 == pw->swipe_end_time) { + /* + * Someone has just set the flag to start clearing. + * we do this by combing through the connections up to a "time T" + * which is now, and requeueing everything except the expired + * connections and those matching the interface(s) being cleared. + */ + + /* + * first filter the sw_if_index bitmap that they want from us, by + * a bitmap of sw_if_index for which we actually have connections. + */ + if ((pw->pending_clear_sw_if_index_bitmap == 0) + || (pw->serviced_sw_if_index_bitmap == 0)) { +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning("WORKER-CLEAR: someone tried to call clear, but one of the bitmaps are empty"); +#endif + clib_bitmap_zero(pw->pending_clear_sw_if_index_bitmap); + } else { +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning("WORKER-CLEAR: (before and) swiping sw-if-index bitmap: %U, my serviced bitmap %U", + format_bitmap_hex, pw->pending_clear_sw_if_index_bitmap, + format_bitmap_hex, pw->serviced_sw_if_index_bitmap); +#endif + pw->pending_clear_sw_if_index_bitmap = clib_bitmap_and(pw->pending_clear_sw_if_index_bitmap, + pw->serviced_sw_if_index_bitmap); + } + + if (clib_bitmap_is_zero(pw->pending_clear_sw_if_index_bitmap)) { + /* if the cross-section is a zero vector, no need to do anything. */ +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning("WORKER: clearing done - nothing to do"); +#endif + pw->clear_in_process = 0; + } else { +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning("WORKER-CLEAR: swiping sw-if-index bitmap: %U, my serviced bitmap %U", + format_bitmap_hex, pw->pending_clear_sw_if_index_bitmap, + format_bitmap_hex, pw->serviced_sw_if_index_bitmap); +#endif + /* swipe through the connection lists until enqueue timestamps become above "now" */ + pw->swipe_end_time = now; + } + } + } + num_expired = acl_fa_check_idle_sessions(am, thread_index, now); + // clib_warning("WORKER-CLEAR: checked %d sessions (clear_in_progress: %d)", num_expired, pw->clear_in_process); + if (pw->clear_in_process) { + if (0 == num_expired) { + /* we were clearing but we could not process any more connections. time to stop. 
*/ + clib_bitmap_zero(pw->pending_clear_sw_if_index_bitmap); + pw->clear_in_process = 0; +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning("WORKER: clearing done, all done"); +#endif + } else { +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning("WORKER-CLEAR: more work to do, raising interrupt"); +#endif + /* should continue clearing.. So could they please sent an interrupt again? */ + pw->interrupt_is_needed = 1; + } + } else { + if (num_expired >= am->fa_max_deleted_sessions_per_interval) { + /* there was too much work, we should get an interrupt ASAP */ + pw->interrupt_is_needed = 1; + pw->interrupt_is_unwanted = 0; + } else if (num_expired <= am->fa_min_deleted_sessions_per_interval) { + /* signal that they should trigger us less */ + pw->interrupt_is_needed = 0; + pw->interrupt_is_unwanted = 1; + } else { + /* the current rate of interrupts is ok */ + pw->interrupt_is_needed = 0; + pw->interrupt_is_unwanted = 0; + } + } + pw->interrupt_generation = am->fa_interrupt_generation; + return 0; +} + +static void +send_one_worker_interrupt (vlib_main_t * vm, acl_main_t *am, int thread_index) +{ + acl_fa_per_worker_data_t *pw = &am->per_worker_data[thread_index]; + if (!pw->interrupt_is_pending) { + pw->interrupt_is_pending = 1; + vlib_node_set_interrupt_pending (vlib_mains[thread_index], + acl_fa_worker_session_cleaner_process_node.index); + /* if the interrupt was requested, mark that done. */ + /* pw->interrupt_is_needed = 0; */ + } +} + +static void +send_interrupts_to_workers (vlib_main_t * vm, acl_main_t *am) +{ + int i; + /* Can't use vec_len(am->per_worker_data) since the threads might not have come up yet; */ + int n_threads = vec_len(vlib_mains); + for (i = n_threads > 1 ? 1 : 0; i < n_threads; i++) { + send_one_worker_interrupt(vm, am, i); + } +} + +/* centralized process to drive per-worker cleaners */ +static uword +acl_fa_session_cleaner_process (vlib_main_t * vm, vlib_node_runtime_t * rt, + vlib_frame_t * f) +{ + acl_main_t *am = &acl_main; + u64 now; + f64 cpu_cps = vm->clib_time.clocks_per_second; + u64 next_expire; + /* We should check if there are connections to clean up - at least twice a second */ + u64 max_timer_wait_interval = cpu_cps / 2; + uword event_type, *event_data = 0; + acl_fa_per_worker_data_t *pw0; + + am->fa_current_cleaner_timer_wait_interval = max_timer_wait_interval; + am->fa_cleaner_node_index = acl_fa_session_cleaner_process_node.index; + am->fa_interrupt_generation = 1; + while (1) + { + now = clib_cpu_time_now (); + next_expire = now + am->fa_current_cleaner_timer_wait_interval; + int has_pending_conns = 0; + u16 ti; + u8 tt; + + /* + * walk over all per-thread list heads of different timeouts, + * and see if there are any connections pending. + * If there aren't - we do not need to wake up until the + * worker code signals that it has added a connection. + * + * Also, while we are at it, calculate the earliest we need to wake up. 
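+ * In pseudocode, the scan below computes
+ *
+ *   next_expire = now + fa_current_cleaner_timer_wait_interval;
+ *   for each (worker w, timeout list t) with a non-empty list:
+ *     next_expire = min (next_expire,
+ *                        head(w,t).link_enqueue_time + list_timeout);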
+ */ + for(ti = 0; ti < vec_len(vlib_mains); ti++) { + if (ti >= vec_len(am->per_worker_data)) { + continue; + } + acl_fa_per_worker_data_t *pw = &am->per_worker_data[ti]; + for(tt = 0; tt < vec_len(pw->fa_conn_list_head); tt++) { + u64 head_expiry = acl_fa_get_list_head_expiry_time(am, pw, now, ti, tt); + if ((head_expiry < next_expire) && !pw->interrupt_is_pending) { +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning("Head expiry: %lu, now: %lu, next_expire: %lu (worker: %d, tt: %d)", head_expiry, now, next_expire, ti, tt); +#endif + next_expire = head_expiry; + } + if (~0 != pw->fa_conn_list_head[tt]) { + has_pending_conns = 1; + } + } + } + + /* If no pending connections and no ACL applied then no point in timing out */ + if (!has_pending_conns && (0 == am->fa_total_enabled_count)) + { + am->fa_cleaner_cnt_wait_without_timeout++; + (void) vlib_process_wait_for_event (vm); + event_type = vlib_process_get_events (vm, &event_data); + } + else + { + f64 timeout = ((i64) next_expire - (i64) now) / cpu_cps; + if (timeout <= 0) + { + /* skip waiting altogether */ + event_type = ~0; + } + else + { + am->fa_cleaner_cnt_wait_with_timeout++; + (void) vlib_process_wait_for_event_or_clock (vm, timeout); + event_type = vlib_process_get_events (vm, &event_data); + } + } + + switch (event_type) + { + case ~0: + /* nothing to do */ + break; + case ACL_FA_CLEANER_RESCHEDULE: + /* Nothing to do. */ + break; + case ACL_FA_CLEANER_DELETE_BY_SW_IF_INDEX: + { + uword *clear_sw_if_index_bitmap = 0; + uword *sw_if_index0; + int clear_all = 0; +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning("ACL_FA_CLEANER_DELETE_BY_SW_IF_INDEX received"); +#endif + vec_foreach (sw_if_index0, event_data) + { + am->fa_cleaner_cnt_delete_by_sw_index++; +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning + ("ACL_FA_NODE_CLEAN: ACL_FA_CLEANER_DELETE_BY_SW_IF_INDEX: %d", + *sw_if_index0); +#endif + if (*sw_if_index0 == ~0) + { + clear_all = 1; + } + else + { + if (!pool_is_free_index (am->vnet_main->interface_main.sw_interfaces, *sw_if_index0)) + { + clear_sw_if_index_bitmap = clib_bitmap_set(clear_sw_if_index_bitmap, *sw_if_index0, 1); + } + } + } +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning("ACL_FA_CLEANER_DELETE_BY_SW_IF_INDEX bitmap: %U", format_bitmap_hex, clear_sw_if_index_bitmap); +#endif + vec_foreach(pw0, am->per_worker_data) { + CLIB_MEMORY_BARRIER (); + while (pw0->clear_in_process) { + CLIB_MEMORY_BARRIER (); +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning("ACL_FA_NODE_CLEAN: waiting previous cleaning cycle to finish on %d...", pw0 - am->per_worker_data); +#endif + vlib_process_suspend(vm, 0.0001); + if (pw0->interrupt_is_needed) { + send_one_worker_interrupt(vm, am, (pw0 - am->per_worker_data)); + } + } + if (pw0->clear_in_process) { + clib_warning("ERROR-BUG! 
Could not initiate cleaning on worker because another cleanup in progress"); + } else { + if (clear_all) + { + /* if we need to clear all, then just clear the interfaces that we are servicing */ + pw0->pending_clear_sw_if_index_bitmap = clib_bitmap_dup(pw0->serviced_sw_if_index_bitmap); + } + else + { + pw0->pending_clear_sw_if_index_bitmap = clib_bitmap_dup(clear_sw_if_index_bitmap); + } + pw0->clear_in_process = 1; + } + } + /* send some interrupts so they can start working */ + send_interrupts_to_workers(vm, am); + + /* now wait till they all complete */ +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning("CLEANER mains len: %d per-worker len: %d", vec_len(vlib_mains), vec_len(am->per_worker_data)); +#endif + vec_foreach(pw0, am->per_worker_data) { + CLIB_MEMORY_BARRIER (); + while (pw0->clear_in_process) { + CLIB_MEMORY_BARRIER (); +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning("ACL_FA_NODE_CLEAN: waiting for my cleaning cycle to finish on %d...", pw0 - am->per_worker_data); +#endif + vlib_process_suspend(vm, 0.0001); + if (pw0->interrupt_is_needed) { + send_one_worker_interrupt(vm, am, (pw0 - am->per_worker_data)); + } + } + } +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning("ACL_FA_NODE_CLEAN: cleaning done"); +#endif + clib_bitmap_free(clear_sw_if_index_bitmap); + } + break; + default: +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning ("ACL plugin connection cleaner: unknown event %u", + event_type); +#endif + vlib_node_increment_counter (vm, + acl_fa_session_cleaner_process_node. + index, + ACL_FA_CLEANER_ERROR_UNKNOWN_EVENT, 1); + am->fa_cleaner_cnt_unknown_event++; + break; + } + + send_interrupts_to_workers(vm, am); + + if (event_data) + _vec_len (event_data) = 0; + + /* + * If the interrupts were not processed yet, ensure we wait a bit, + * but up to a point. 
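+ * The loop below is bounded: at most max_wait_cycles iterations of a
+ * 100 us suspend, i.e. roughly 10 ms worst case before moving on.
+ * After it, the wait interval adapts AIMD-style: halved (plus one
+ * tick) while workers ask for more interrupts, nudged back up by
+ * fa_cleaner_wait_time_increment towards the half-second ceiling when
+ * they ask for fewer.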
+ */ + int need_more_wait = 0; + int max_wait_cycles = 100; + do { + need_more_wait = 0; + vec_foreach(pw0, am->per_worker_data) { + if (pw0->interrupt_generation != am->fa_interrupt_generation) { + need_more_wait = 1; + } + } + if (need_more_wait) { + vlib_process_suspend(vm, 0.0001); + } + } while (need_more_wait && (--max_wait_cycles > 0)); + + int interrupts_needed = 0; + int interrupts_unwanted = 0; + + vec_foreach(pw0, am->per_worker_data) { + if (pw0->interrupt_is_needed) { + interrupts_needed++; + /* the per-worker value is reset when sending the interrupt */ + } + if (pw0->interrupt_is_unwanted) { + interrupts_unwanted++; + pw0->interrupt_is_unwanted = 0; + } + } + if (interrupts_needed) { + /* they need more interrupts, do less waiting around next time */ + am->fa_current_cleaner_timer_wait_interval /= 2; + /* never go into zero-wait either though - we need to give the space to others */ + am->fa_current_cleaner_timer_wait_interval += 1; + } else if (interrupts_unwanted) { + /* slowly increase the amount of sleep up to a limit */ + if (am->fa_current_cleaner_timer_wait_interval < max_timer_wait_interval) + am->fa_current_cleaner_timer_wait_interval += cpu_cps * am->fa_cleaner_wait_time_increment; + } + am->fa_cleaner_cnt_event_cycles++; + am->fa_interrupt_generation++; + } + /* NOT REACHED */ + return 0; +} + + +void +acl_fa_enable_disable (u32 sw_if_index, int is_input, int enable_disable) +{ + acl_main_t *am = &acl_main; + if (enable_disable) { + acl_fa_verify_init_sessions(am); + am->fa_total_enabled_count++; + void *oldheap = clib_mem_set_heap (am->vlib_main->heap_base); + vlib_process_signal_event (am->vlib_main, am->fa_cleaner_node_index, + ACL_FA_CLEANER_RESCHEDULE, 0); + clib_mem_set_heap (oldheap); + } else { + am->fa_total_enabled_count--; + } + + if (is_input) + { + ASSERT(clib_bitmap_get(am->fa_in_acl_on_sw_if_index, sw_if_index) != enable_disable); + void *oldheap = clib_mem_set_heap (am->vlib_main->heap_base); + vnet_feature_enable_disable ("ip4-unicast", "acl-plugin-in-ip4-fa", + sw_if_index, enable_disable, 0, 0); + vnet_feature_enable_disable ("ip6-unicast", "acl-plugin-in-ip6-fa", + sw_if_index, enable_disable, 0, 0); + clib_mem_set_heap (oldheap); + am->fa_in_acl_on_sw_if_index = + clib_bitmap_set (am->fa_in_acl_on_sw_if_index, sw_if_index, + enable_disable); + } + else + { + ASSERT(clib_bitmap_get(am->fa_out_acl_on_sw_if_index, sw_if_index) != enable_disable); + void *oldheap = clib_mem_set_heap (am->vlib_main->heap_base); + vnet_feature_enable_disable ("ip4-output", "acl-plugin-out-ip4-fa", + sw_if_index, enable_disable, 0, 0); + vnet_feature_enable_disable ("ip6-output", "acl-plugin-out-ip6-fa", + sw_if_index, enable_disable, 0, 0); + clib_mem_set_heap (oldheap); + am->fa_out_acl_on_sw_if_index = + clib_bitmap_set (am->fa_out_acl_on_sw_if_index, sw_if_index, + enable_disable); + } + if ((!enable_disable) && (!acl_fa_ifc_has_in_acl (am, sw_if_index)) + && (!acl_fa_ifc_has_out_acl (am, sw_if_index))) + { +#ifdef FA_NODE_VERBOSE_DEBUG + clib_warning("ENABLE-DISABLE: clean the connections on interface %d", sw_if_index); +#endif + void *oldheap = clib_mem_set_heap (am->vlib_main->heap_base); + vlib_process_signal_event (am->vlib_main, am->fa_cleaner_node_index, + ACL_FA_CLEANER_DELETE_BY_SW_IF_INDEX, + sw_if_index); + clib_mem_set_heap (oldheap); + } +} + +void +show_fa_sessions_hash(vlib_main_t * vm, u32 verbose) +{ + acl_main_t *am = &acl_main; + if (am->fa_sessions_hash_is_initialized) { + vlib_cli_output(vm, "\nSession lookup hash table:\n%U\n\n", + BV 
(format_bihash), &am->fa_sessions_hash, verbose); + } else { + vlib_cli_output(vm, "\nSession lookup hash table is not allocated.\n\n"); + } +} + + +/* *INDENT-OFF* */ + +VLIB_REGISTER_NODE (acl_fa_worker_session_cleaner_process_node, static) = { + .function = acl_fa_worker_conn_cleaner_process, + .name = "acl-plugin-fa-worker-cleaner-process", + .type = VLIB_NODE_TYPE_INPUT, + .state = VLIB_NODE_STATE_INTERRUPT, +}; + +VLIB_REGISTER_NODE (acl_fa_session_cleaner_process_node, static) = { + .function = acl_fa_session_cleaner_process, + .type = VLIB_NODE_TYPE_PROCESS, + .name = "acl-plugin-fa-cleaner-process", + .n_errors = ARRAY_LEN (acl_fa_cleaner_error_strings), + .error_strings = acl_fa_cleaner_error_strings, + .n_next_nodes = 0, + .next_nodes = {}, +}; + + +VLIB_REGISTER_NODE (acl_in_l2_ip6_node) = +{ + .function = acl_in_ip6_l2_node_fn, + .name = "acl-plugin-in-ip6-l2", + .vector_size = sizeof (u32), + .format_trace = format_acl_fa_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN (acl_fa_error_strings), + .error_strings = acl_fa_error_strings, + .n_next_nodes = ACL_FA_N_NEXT, + .next_nodes = + { + [ACL_FA_ERROR_DROP] = "error-drop", + } +}; + +VLIB_REGISTER_NODE (acl_in_l2_ip4_node) = +{ + .function = acl_in_ip4_l2_node_fn, + .name = "acl-plugin-in-ip4-l2", + .vector_size = sizeof (u32), + .format_trace = format_acl_fa_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN (acl_fa_error_strings), + .error_strings = acl_fa_error_strings, + .n_next_nodes = ACL_FA_N_NEXT, + .next_nodes = + { + [ACL_FA_ERROR_DROP] = "error-drop", + } +}; + +VLIB_REGISTER_NODE (acl_out_l2_ip6_node) = +{ + .function = acl_out_ip6_l2_node_fn, + .name = "acl-plugin-out-ip6-l2", + .vector_size = sizeof (u32), + .format_trace = format_acl_fa_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN (acl_fa_error_strings), + .error_strings = acl_fa_error_strings, + .n_next_nodes = ACL_FA_N_NEXT, + .next_nodes = + { + [ACL_FA_ERROR_DROP] = "error-drop", + } +}; + +VLIB_REGISTER_NODE (acl_out_l2_ip4_node) = +{ + .function = acl_out_ip4_l2_node_fn, + .name = "acl-plugin-out-ip4-l2", + .vector_size = sizeof (u32), + .format_trace = format_acl_fa_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN (acl_fa_error_strings), + .error_strings = acl_fa_error_strings, + .n_next_nodes = ACL_FA_N_NEXT, + .next_nodes = + { + [ACL_FA_ERROR_DROP] = "error-drop", + } +}; + + +VLIB_REGISTER_NODE (acl_in_fa_ip6_node) = +{ + .function = acl_in_ip6_fa_node_fn, + .name = "acl-plugin-in-ip6-fa", + .vector_size = sizeof (u32), + .format_trace = format_acl_fa_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN (acl_fa_error_strings), + .error_strings = acl_fa_error_strings, + .n_next_nodes = ACL_FA_N_NEXT, + .next_nodes = + { + [ACL_FA_ERROR_DROP] = "error-drop", + } +}; + +VNET_FEATURE_INIT (acl_in_ip6_fa_feature, static) = +{ + .arc_name = "ip6-unicast", + .node_name = "acl-plugin-in-ip6-fa", + .runs_before = VNET_FEATURES ("ip6-flow-classify"), +}; + +VLIB_REGISTER_NODE (acl_in_fa_ip4_node) = +{ + .function = acl_in_ip4_fa_node_fn, + .name = "acl-plugin-in-ip4-fa", + .vector_size = sizeof (u32), + .format_trace = format_acl_fa_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN (acl_fa_error_strings), + .error_strings = acl_fa_error_strings, + .n_next_nodes = ACL_FA_N_NEXT, + .next_nodes = + { + [ACL_FA_ERROR_DROP] = "error-drop", + } +}; + +VNET_FEATURE_INIT (acl_in_ip4_fa_feature, static) = +{ + .arc_name = "ip4-unicast", + .node_name = 
"acl-plugin-in-ip4-fa", + .runs_before = VNET_FEATURES ("ip4-flow-classify"), +}; + + +VLIB_REGISTER_NODE (acl_out_fa_ip6_node) = +{ + .function = acl_out_ip6_fa_node_fn, + .name = "acl-plugin-out-ip6-fa", + .vector_size = sizeof (u32), + .format_trace = format_acl_fa_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN (acl_fa_error_strings), + .error_strings = acl_fa_error_strings, + .n_next_nodes = ACL_FA_N_NEXT, + .next_nodes = + { + [ACL_FA_ERROR_DROP] = "error-drop", + } +}; + +VNET_FEATURE_INIT (acl_out_ip6_fa_feature, static) = +{ + .arc_name = "ip6-output", + .node_name = "acl-plugin-out-ip6-fa", + .runs_before = VNET_FEATURES ("interface-output"), +}; + +VLIB_REGISTER_NODE (acl_out_fa_ip4_node) = +{ + .function = acl_out_ip4_fa_node_fn, + .name = "acl-plugin-out-ip4-fa", + .vector_size = sizeof (u32), + .format_trace = format_acl_fa_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN (acl_fa_error_strings), + .error_strings = acl_fa_error_strings, + .n_next_nodes = ACL_FA_N_NEXT, + /* edit / add dispositions here */ + .next_nodes = + { + [ACL_FA_ERROR_DROP] = "error-drop", + } +}; + +VNET_FEATURE_INIT (acl_out_ip4_fa_feature, static) = +{ + .arc_name = "ip4-output", + .node_name = "acl-plugin-out-ip4-fa", + .runs_before = VNET_FEATURES ("interface-output"), +}; + + +/* *INDENT-ON* */ diff --git a/src/plugins/acl/fa_node.h b/src/plugins/acl/fa_node.h new file mode 100644 index 00000000..fa9a2303 --- /dev/null +++ b/src/plugins/acl/fa_node.h @@ -0,0 +1,174 @@ +#ifndef _FA_NODE_H_ +#define _FA_NODE_H_ + +#include <stddef.h> +#include <vppinfra/bihash_40_8.h> + +#define TCP_FLAG_FIN 0x01 +#define TCP_FLAG_SYN 0x02 +#define TCP_FLAG_RST 0x04 +#define TCP_FLAG_PUSH 0x08 +#define TCP_FLAG_ACK 0x10 +#define TCP_FLAG_URG 0x20 +#define TCP_FLAG_ECE 0x40 +#define TCP_FLAG_CWR 0x80 +#define TCP_FLAGS_RSTFINACKSYN (TCP_FLAG_RST + TCP_FLAG_FIN + TCP_FLAG_SYN + TCP_FLAG_ACK) +#define TCP_FLAGS_ACKSYN (TCP_FLAG_SYN + TCP_FLAG_ACK) + +#define ACL_FA_CONN_TABLE_DEFAULT_HASH_NUM_BUCKETS (64 * 1024) +#define ACL_FA_CONN_TABLE_DEFAULT_HASH_MEMORY_SIZE (1<<30) +#define ACL_FA_CONN_TABLE_DEFAULT_MAX_ENTRIES 1000000 + +typedef union { + u64 as_u64; + struct { + u32 sw_if_index; + u16 mask_type_index_lsb; + u8 tcp_flags; + u8 tcp_flags_valid:1; + u8 is_input:1; + u8 l4_valid:1; + u8 is_nonfirst_fragment:1; + u8 is_ip6:1; + u8 flags_reserved:3; + }; +} fa_packet_info_t; + +typedef union { + u64 as_u64; + struct { + u16 port[2]; + u16 proto; + u16 lsb_of_sw_if_index; + }; +} fa_session_l4_key_t; + +typedef union { + struct { + ip46_address_t addr[2]; + fa_session_l4_key_t l4; + /* This field should align with u64 value in bihash_40_8 keyvalue struct */ + fa_packet_info_t pkt; + }; + clib_bihash_kv_40_8_t kv; +} fa_5tuple_t; + + +typedef struct { + fa_5tuple_t info; /* (5+1)*8 = 48 bytes */ + u64 last_active_time; /* +8 bytes = 56 */ + u32 sw_if_index; /* +4 bytes = 60 */ + union { + u8 as_u8[2]; + u16 as_u16; + } tcp_flags_seen; ; /* +2 bytes = 62 */ + u16 thread_index; /* +2 bytes = 64 */ + u64 link_enqueue_time; /* 8 byte = 8 */ + u32 link_prev_idx; /* +4 bytes = 12 */ + u32 link_next_idx; /* +4 bytes = 16 */ + u8 link_list_id; /* +1 bytes = 17 */ + u8 reserved1[7]; /* +7 bytes = 24 */ + u64 reserved2[5]; /* +5*8 bytes = 64 */ +} fa_session_t; + + +/* This structure is used to fill in the u64 value + in the per-sw-if-index hash table */ +typedef struct { + union { + u64 as_u64; + struct { + u32 session_index; + u16 thread_index; + u16 reserved0; + }; + }; +} 
fa_full_session_id_t; + +/* + * A few compile-time constraints on the size and the layout of the union, to ensure + * it makes sense both for bihash and for us. + */ + +#define CT_ASSERT_EQUAL(name, x,y) typedef int assert_ ## name ## _compile_time_assertion_failed[((x) == (y))-1] +CT_ASSERT_EQUAL(fa_l3_key_size_is_40, offsetof(fa_5tuple_t, pkt), offsetof(clib_bihash_kv_40_8_t, value)); +CT_ASSERT_EQUAL(fa_l4_key_t_is_8, sizeof(fa_session_l4_key_t), sizeof(u64)); +CT_ASSERT_EQUAL(fa_packet_info_t_is_8, sizeof(fa_packet_info_t), sizeof(u64)); +CT_ASSERT_EQUAL(fa_l3_kv_size_is_48, sizeof(fa_5tuple_t), sizeof(clib_bihash_kv_40_8_t)); + +/* Let's try to fit within two cachelines */ +CT_ASSERT_EQUAL(fa_session_t_size_is_128, sizeof(fa_session_t), 128); + +/* Session ID MUST be the same as u64 */ +CT_ASSERT_EQUAL(fa_full_session_id_size_is_64, sizeof(fa_full_session_id_t), sizeof(u64)); +#undef CT_ASSERT_EQUAL + +typedef struct { + /* The pool of sessions managed by this worker */ + fa_session_t *fa_sessions_pool; + /* per-worker ACL_N_TIMEOUTS of conn lists */ + u32 *fa_conn_list_head; + u32 *fa_conn_list_tail; + /* adds and deletes per-worker-per-interface */ + u64 *fa_session_dels_by_sw_if_index; + u64 *fa_session_adds_by_sw_if_index; + /* Vector of expired connections retrieved from lists */ + u32 *expired; + /* the earliest next expiry time */ + u64 next_expiry_time; + /* if not zero, look at all the elements until their enqueue timestamp is after below one */ + u64 requeue_until_time; + /* Current time between the checks */ + u64 current_time_wait_interval; + /* Counter of how many sessions we did delete */ + u64 cnt_deleted_sessions; + /* Counter of already deleted sessions being deleted - should not increment unless a bug */ + u64 cnt_already_deleted_sessions; + /* Number of times we requeued a session to a head of the list */ + u64 cnt_session_timer_restarted; + /* swipe up to this enqueue time, rather than following the timeouts */ + u64 swipe_end_time; + /* bitmap of sw_if_index serviced by this worker */ + uword *serviced_sw_if_index_bitmap; + /* bitmap of sw_if_indices to clear. set by main thread, cleared by worker */ + uword *pending_clear_sw_if_index_bitmap; + /* atomic, indicates that the swipe-deletion of connections is in progress */ + u32 clear_in_process; + /* Interrupt is pending from main thread */ + int interrupt_is_pending; + /* + * Interrupt node on the worker thread sets this if it knows there is + * more work to do, but it has to finish to avoid hogging the + * core for too long. + */ + int interrupt_is_needed; + /* + * Set to indicate that the interrupt node wants to get less interrupts + * because there is not enough work for the current rate. + */ + int interrupt_is_unwanted; + /* + * Set to copy of a "generation" counter in main thread so we can sync the interrupts. 
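+ * The handshake: the cleaner process bumps am->fa_interrupt_generation
+ * once per event cycle, each worker copies it after servicing its
+ * lists, and the process waits (bounded) until
+ *
+ *   pw->interrupt_generation == am->fa_interrupt_generation
+ *
+ * holds for every worker.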
+ */ + int interrupt_generation; +} acl_fa_per_worker_data_t; + + +typedef enum { + ACL_FA_ERROR_DROP, + ACL_FA_N_NEXT, +} acl_fa_next_t; + + +enum +{ + ACL_FA_CLEANER_RESCHEDULE = 1, + ACL_FA_CLEANER_DELETE_BY_SW_IF_INDEX, +} acl_fa_cleaner_process_event_e; + +void acl_fa_enable_disable(u32 sw_if_index, int is_input, int enable_disable); + +void show_fa_sessions_hash(vlib_main_t * vm, u32 verbose); + + +#endif diff --git a/src/plugins/acl/hash_lookup.c b/src/plugins/acl/hash_lookup.c new file mode 100644 index 00000000..7869027b --- /dev/null +++ b/src/plugins/acl/hash_lookup.c @@ -0,0 +1,894 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#include <stddef.h> +#include <netinet/in.h> + +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vlibsocket/api.h> + +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vppinfra/error.h> +#include <vnet/plugin/plugin.h> +#include <acl/acl.h> +#include <vppinfra/bihash_48_8.h> + +#include "hash_lookup.h" +#include "hash_lookup_private.h" + + +static inline applied_hash_ace_entry_t **get_applied_hash_aces(acl_main_t *am, int is_input, u32 sw_if_index) +{ + applied_hash_ace_entry_t **applied_hash_aces = is_input ? vec_elt_at_index(am->input_hash_entry_vec_by_sw_if_index, sw_if_index) + : vec_elt_at_index(am->output_hash_entry_vec_by_sw_if_index, sw_if_index); + return applied_hash_aces; +} + + + +/* + * This returns true if there is indeed a match on the portranges. + * With all these levels of indirections, this is not going to be very fast, + * so, best use the individual ports or wildcard ports for performance. 
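+ * A port range can be folded into the hash mask only when it is a
+ * power-of-2 sized, aligned block; one way to test that (hypothetical
+ * helper, not in the tree):
+ *
+ *   static int range_is_maskable (u16 first, u16 last)
+ *   {
+ *     u16 span = last - first + 1; // 0 == the full 0..65535 range
+ *     return 0 == (span & (span - 1)) && 0 == (first & (span - 1));
+ *   }
+ *
+ * anything else sets need_portrange_check and lands on this slow path.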
+ */ +static int +match_portranges(acl_main_t *am, fa_5tuple_t *match, u32 index) +{ + + applied_hash_ace_entry_t **applied_hash_aces = get_applied_hash_aces(am, match->pkt.is_input, match->pkt.sw_if_index); + applied_hash_ace_entry_t *pae = vec_elt_at_index((*applied_hash_aces), index); + + acl_rule_t *r = &(am->acls[pae->acl_index].rules[pae->ace_index]); + DBG("PORTMATCH: %d <= %d <= %d && %d <= %d <= %d ?", + r->src_port_or_type_first, match->l4.port[0], r->src_port_or_type_last, + r->dst_port_or_code_first, match->l4.port[1], r->dst_port_or_code_last); + + return ( ((r->src_port_or_type_first <= match->l4.port[0]) && r->src_port_or_type_last >= match->l4.port[0]) && + ((r->dst_port_or_code_first <= match->l4.port[1]) && r->dst_port_or_code_last >= match->l4.port[1]) ); +} + +static u32 +multi_acl_match_get_applied_ace_index(acl_main_t *am, fa_5tuple_t *match) +{ + clib_bihash_kv_48_8_t kv; + clib_bihash_kv_48_8_t result; + fa_5tuple_t *kv_key = (fa_5tuple_t *)kv.key; + hash_acl_lookup_value_t *result_val = (hash_acl_lookup_value_t *)&result.value; + u64 *pmatch = (u64 *)match; + u64 *pmask; + u64 *pkey; + int mask_type_index; + u32 curr_match_index = ~0; + + u32 sw_if_index = match->pkt.sw_if_index; + u8 is_input = match->pkt.is_input; + applied_hash_ace_entry_t **applied_hash_aces = get_applied_hash_aces(am, is_input, sw_if_index); + applied_hash_acl_info_t **applied_hash_acls = is_input ? &am->input_applied_hash_acl_info_by_sw_if_index : + &am->output_applied_hash_acl_info_by_sw_if_index; + + DBG("TRYING TO MATCH: %016llx %016llx %016llx %016llx %016llx %016llx", + pmatch[0], pmatch[1], pmatch[2], pmatch[3], pmatch[4], pmatch[5]); + + for(mask_type_index=0; mask_type_index < pool_len(am->ace_mask_type_pool); mask_type_index++) { + if (!clib_bitmap_get(vec_elt_at_index((*applied_hash_acls), sw_if_index)->mask_type_index_bitmap, mask_type_index)) { + /* This bit is not set. Avoid trying to match */ + continue; + } + ace_mask_type_entry_t *mte = vec_elt_at_index(am->ace_mask_type_pool, mask_type_index); + pmatch = (u64 *)match; + pmask = (u64 *)&mte->mask; + pkey = (u64 *)kv.key; + /* + * unrolling the below loop results in a noticeable performance increase. + int i; + for(i=0; i<6; i++) { + kv.key[i] = pmatch[i] & pmask[i]; + } + */ + + *pkey++ = *pmatch++ & *pmask++; + *pkey++ = *pmatch++ & *pmask++; + *pkey++ = *pmatch++ & *pmask++; + *pkey++ = *pmatch++ & *pmask++; + *pkey++ = *pmatch++ & *pmask++; + *pkey++ = *pmatch++ & *pmask++; + + kv_key->pkt.mask_type_index_lsb = mask_type_index; + DBG(" KEY %3d: %016llx %016llx %016llx %016llx %016llx %016llx", mask_type_index, + kv.key[0], kv.key[1], kv.key[2], kv.key[3], kv.key[4], kv.key[5]); + int res = BV (clib_bihash_search) (&am->acl_lookup_hash, &kv, &result); + if (res == 0) { + DBG("ACL-MATCH! result_val: %016llx", result_val->as_u64); + if (result_val->applied_entry_index < curr_match_index) { + if (PREDICT_FALSE(result_val->need_portrange_check)) { + /* + * This is going to be slow, since we can have multiple superset + * entries for narrow-ish portranges, e.g.: + * 0..42 100..400, 230..60000, + * so we need to walk linearly and check if they match. + */ + + u32 curr_index = result_val->applied_entry_index; + while ((curr_index != ~0) && !match_portranges(am, match, curr_index)) { + /* while no match and there are more entries, walk... 
*/
+            applied_hash_ace_entry_t *pae = vec_elt_at_index((*applied_hash_aces),curr_index);
+            DBG("entry %d did not portmatch, advancing to %d", curr_index, pae->next_applied_entry_index);
+            curr_index = pae->next_applied_entry_index;
+          }
+          if (curr_index < curr_match_index) {
+            DBG("The index %d is the new candidate in portrange matches.", curr_index);
+            curr_match_index = curr_index;
+          } else {
+            DBG("Curr portmatch index %d is too big vs. current matched one %d", curr_index, curr_match_index);
+          }
+        } else {
+          /* The usual path is here. Found an entry in front of the current candidate - so it's a new one */
+          DBG("This match is the new candidate");
+          curr_match_index = result_val->applied_entry_index;
+          if (!result_val->shadowed) {
+            /* the new result is known not to be shadowed, so there is no point in looking up further */
+            break;
+          }
+        }
+      }
+    }
+  }
+  DBG("MATCH-RESULT: %d", curr_match_index);
+  return curr_match_index;
+}
+
+static void
+hashtable_add_del(acl_main_t *am, clib_bihash_kv_48_8_t *kv, int is_add)
+{
+    DBG("HASH ADD/DEL: %016llx %016llx %016llx %016llx %016llx %016llx %016llx add %d",
+        kv->key[0], kv->key[1], kv->key[2],
+        kv->key[3], kv->key[4], kv->key[5], kv->value, is_add);
+    BV (clib_bihash_add_del) (&am->acl_lookup_hash, kv, is_add);
+}
+
+static void
+fill_applied_hash_ace_kv(acl_main_t *am,
+                         applied_hash_ace_entry_t **applied_hash_aces,
+                         u32 sw_if_index, u8 is_input,
+                         u32 new_index, clib_bihash_kv_48_8_t *kv)
+{
+  fa_5tuple_t *kv_key = (fa_5tuple_t *)kv->key;
+  hash_acl_lookup_value_t *kv_val = (hash_acl_lookup_value_t *)&kv->value;
+  applied_hash_ace_entry_t *pae = vec_elt_at_index((*applied_hash_aces), new_index);
+  hash_acl_info_t *ha = vec_elt_at_index(am->hash_acl_infos, pae->acl_index);
+
+  memcpy(kv_key, &(vec_elt_at_index(ha->rules, pae->hash_ace_info_index)->match), sizeof(*kv_key));
+  /* initialize the sw_if_index and direction */
+  kv_key->pkt.sw_if_index = sw_if_index;
+  kv_key->pkt.is_input = is_input;
+  kv_val->as_u64 = 0;
+  kv_val->applied_entry_index = new_index;
+  kv_val->need_portrange_check = vec_elt_at_index(ha->rules, pae->hash_ace_info_index)->src_portrange_not_powerof2 ||
+    vec_elt_at_index(ha->rules, pae->hash_ace_info_index)->dst_portrange_not_powerof2;
+  /* by default assume all values are shadowed -> check all mask types */
+  kv_val->shadowed = 1;
+}
+
+static void
+add_del_hashtable_entry(acl_main_t *am,
+                        u32 sw_if_index, u8 is_input,
+                        applied_hash_ace_entry_t **applied_hash_aces,
+                        u32 index, int is_add)
+{
+  clib_bihash_kv_48_8_t kv;
+
+  fill_applied_hash_ace_kv(am, applied_hash_aces, sw_if_index, is_input, index, &kv);
+  hashtable_add_del(am, &kv, is_add);
+}
+
+
+
+static void
+activate_applied_ace_hash_entry(acl_main_t *am,
+                                u32 sw_if_index, u8 is_input,
+                                applied_hash_ace_entry_t **applied_hash_aces,
+                                u32 new_index)
+{
+  clib_bihash_kv_48_8_t kv;
+  ASSERT(new_index != ~0);
+  applied_hash_ace_entry_t *pae = vec_elt_at_index((*applied_hash_aces), new_index);
+  DBG("activate_applied_ace_hash_entry sw_if_index %d is_input %d new_index %d", sw_if_index, is_input, new_index);
+
+  fill_applied_hash_ace_kv(am, applied_hash_aces, sw_if_index, is_input, new_index, &kv);
+
+  DBG("APPLY ADD KEY: %016llx %016llx %016llx %016llx %016llx %016llx",
+        kv.key[0], kv.key[1], kv.key[2],
+        kv.key[3], kv.key[4], kv.key[5]);
+
+  clib_bihash_kv_48_8_t result;
+  hash_acl_lookup_value_t *result_val = (hash_acl_lookup_value_t *)&result.value;
+  int res = BV (clib_bihash_search) (&am->acl_lookup_hash, &kv, &result);
+  ASSERT(new_index != ~0);
+  ASSERT(new_index <
vec_len((*applied_hash_aces)));
+  if (res == 0) {
+    /* There already exists an entry or more. Append at the end. */
+    u32 first_index = result_val->applied_entry_index;
+    ASSERT(first_index != ~0);
+    DBG("A key already exists, with applied entry index: %d", first_index);
+    applied_hash_ace_entry_t *first_pae = vec_elt_at_index((*applied_hash_aces), first_index);
+    u32 last_index = first_pae->tail_applied_entry_index;
+    ASSERT(last_index != ~0);
+    applied_hash_ace_entry_t *last_pae = vec_elt_at_index((*applied_hash_aces), last_index);
+    DBG("...advance to chained entry index: %d", last_index);
+    /* link ourselves in */
+    last_pae->next_applied_entry_index = new_index;
+    pae->prev_applied_entry_index = last_index;
+    /* adjust the pointer to the new tail */
+    first_pae->tail_applied_entry_index = new_index;
+  } else {
+    /* It's the very first entry */
+    hashtable_add_del(am, &kv, 1);
+    ASSERT(new_index != ~0);
+    pae->tail_applied_entry_index = new_index;
+  }
+}
+
+static void
+applied_hash_entries_analyze(acl_main_t *am, applied_hash_ace_entry_t **applied_hash_aces)
+{
+  /*
+   * Go over the rules and check which ones are shadowed and which are not.
+   * Naive approach: try to match the match value from every ACE as if it
+   * were a live packet, and see if the resulting match happens earlier in the list.
+   * If it does not match, or if it is later in the ACL, then the entry is not shadowed.
+   *
+   * This approach fails; an example:
+   *   deny tcp 2001:db8::/32 2001:db8::/32
+   *   permit ip 2001:db8::1/128 2001:db8::2/128
+   */
+}
+
+static void *
+hash_acl_set_heap(acl_main_t *am)
+{
+  if (0 == am->hash_lookup_mheap) {
+    am->hash_lookup_mheap = mheap_alloc (0 /* use VM */ , am->hash_lookup_mheap_size);
+    mheap_t *h = mheap_header (am->hash_lookup_mheap);
+    h->flags |= MHEAP_FLAG_THREAD_SAFE;
+  }
+  void *oldheap = clib_mem_set_heap(am->hash_lookup_mheap);
+  return oldheap;
+}
+
+void
+acl_plugin_hash_acl_set_validate_heap(acl_main_t *am, int on)
+{
+  clib_mem_set_heap(hash_acl_set_heap(am));
+  mheap_t *h = mheap_header (am->hash_lookup_mheap);
+  if (on) {
+    h->flags |= MHEAP_FLAG_VALIDATE;
+    h->flags &= ~MHEAP_FLAG_SMALL_OBJECT_CACHE;
+    mheap_validate(h);
+  } else {
+    h->flags &= ~MHEAP_FLAG_VALIDATE;
+    h->flags |= MHEAP_FLAG_SMALL_OBJECT_CACHE;
+  }
+}
+
+void
+acl_plugin_hash_acl_set_trace_heap(acl_main_t *am, int on)
+{
+  clib_mem_set_heap(hash_acl_set_heap(am));
+  mheap_t *h = mheap_header (am->hash_lookup_mheap);
+  if (on) {
+    h->flags |= MHEAP_FLAG_TRACE;
+  } else {
+    h->flags &= ~MHEAP_FLAG_TRACE;
+  }
+}
+
+void
+hash_acl_apply(acl_main_t *am, u32 sw_if_index, u8 is_input, int acl_index)
+{
+  int i;
+
+  DBG0("HASH ACL apply: sw_if_index %d is_input %d acl %d", sw_if_index, is_input, acl_index);
+  if (!am->acl_lookup_hash_initialized) {
+    BV (clib_bihash_init) (&am->acl_lookup_hash, "ACL plugin rule lookup bihash",
+                           am->hash_lookup_hash_buckets, am->hash_lookup_hash_memory);
+    am->acl_lookup_hash_initialized = 1;
+  }
+
+  void *oldheap = hash_acl_set_heap(am);
+  if (is_input) {
+    vec_validate(am->input_hash_entry_vec_by_sw_if_index, sw_if_index);
+  } else {
+    vec_validate(am->output_hash_entry_vec_by_sw_if_index, sw_if_index);
+  }
+  vec_validate(am->hash_acl_infos, acl_index);
+  applied_hash_ace_entry_t **applied_hash_aces = get_applied_hash_aces(am, is_input, sw_if_index);
+
+  hash_acl_info_t *ha = vec_elt_at_index(am->hash_acl_infos, acl_index);
+  u32 **hash_acl_applied_sw_if_index = is_input ?
&ha->inbound_sw_if_index_list + : &ha->outbound_sw_if_index_list; + + int base_offset = vec_len(*applied_hash_aces); + + /* Update the bitmap of the mask types with which the lookup + needs to happen for the ACLs applied to this sw_if_index */ + applied_hash_acl_info_t **applied_hash_acls = is_input ? &am->input_applied_hash_acl_info_by_sw_if_index : + &am->output_applied_hash_acl_info_by_sw_if_index; + vec_validate((*applied_hash_acls), sw_if_index); + applied_hash_acl_info_t *pal = vec_elt_at_index((*applied_hash_acls), sw_if_index); + + /* ensure the list of applied hash acls is initialized and add this acl# to it */ + u32 index = vec_search(pal->applied_acls, acl_index); + if (index != ~0) { + clib_warning("BUG: trying to apply twice acl_index %d on sw_if_index %d is_input %d", + acl_index, sw_if_index, is_input); + goto done; + } + vec_add1(pal->applied_acls, acl_index); + u32 index2 = vec_search((*hash_acl_applied_sw_if_index), sw_if_index); + if (index2 != ~0) { + clib_warning("BUG: trying to apply twice acl_index %d on (sw_if_index %d) is_input %d", + acl_index, sw_if_index, is_input); + goto done; + } + vec_add1((*hash_acl_applied_sw_if_index), sw_if_index); + + pal->mask_type_index_bitmap = clib_bitmap_or(pal->mask_type_index_bitmap, + ha->mask_type_index_bitmap); + /* + * if the applied ACL is empty, the current code will cause a + * different behavior compared to current linear search: an empty ACL will + * simply fallthrough to the next ACL, or the default deny in the end. + * + * This is not a problem, because after vpp-dev discussion, + * the consensus was it should not be possible to apply the non-existent + * ACL, so the change adding this code also takes care of that. + */ + + /* expand the applied aces vector by the necessary amount */ + vec_resize((*applied_hash_aces), vec_len(ha->rules)); + + /* add the rules from the ACL to the hash table for lookup and append to the vector*/ + for(i=0; i < vec_len(ha->rules); i++) { + u32 new_index = base_offset + i; + applied_hash_ace_entry_t *pae = vec_elt_at_index((*applied_hash_aces), new_index); + pae->acl_index = acl_index; + pae->ace_index = ha->rules[i].ace_index; + pae->action = ha->rules[i].action; + pae->hitcount = 0; + pae->hash_ace_info_index = i; + /* we might link it in later */ + pae->next_applied_entry_index = ~0; + pae->prev_applied_entry_index = ~0; + pae->tail_applied_entry_index = ~0; + activate_applied_ace_hash_entry(am, sw_if_index, is_input, applied_hash_aces, new_index); + } + applied_hash_entries_analyze(am, applied_hash_aces); +done: + clib_mem_set_heap (oldheap); +} + +static u32 +find_head_applied_ace_index(applied_hash_ace_entry_t **applied_hash_aces, u32 curr_index) +{ + /* + * find back the first entry. Inefficient so might need to be a bit cleverer + * if this proves to be a problem.. 
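+ * Example: for a chain A <-> B <-> C with A first, starting from C the
+ * loop below follows prev_applied_entry_index C -> B -> A; A has ~0
+ * there, so A is returned, and it is A that carries the authoritative
+ * tail_applied_entry_index for the whole chain.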
+ */ + u32 an_index = curr_index; + ASSERT(an_index != ~0); + applied_hash_ace_entry_t *head_pae = vec_elt_at_index((*applied_hash_aces), an_index); + while(head_pae->prev_applied_entry_index != ~0) { + an_index = head_pae->prev_applied_entry_index; + ASSERT(an_index != ~0); + head_pae = vec_elt_at_index((*applied_hash_aces), an_index); + } + return an_index; +} + +static void +move_applied_ace_hash_entry(acl_main_t *am, + u32 sw_if_index, u8 is_input, + applied_hash_ace_entry_t **applied_hash_aces, + u32 old_index, u32 new_index) +{ + ASSERT(old_index != ~0); + ASSERT(new_index != ~0); + /* move the entry */ + *vec_elt_at_index((*applied_hash_aces), new_index) = *vec_elt_at_index((*applied_hash_aces), old_index); + + /* update the linkage and hash table if necessary */ + applied_hash_ace_entry_t *pae = vec_elt_at_index((*applied_hash_aces), old_index); + + if (pae->prev_applied_entry_index != ~0) { + applied_hash_ace_entry_t *prev_pae = vec_elt_at_index((*applied_hash_aces), pae->prev_applied_entry_index); + ASSERT(prev_pae->next_applied_entry_index == old_index); + prev_pae->next_applied_entry_index = new_index; + } else { + /* first entry - so the hash points to it, update */ + add_del_hashtable_entry(am, sw_if_index, is_input, + applied_hash_aces, new_index, 1); + ASSERT(pae->tail_applied_entry_index != ~0); + } + if (pae->next_applied_entry_index != ~0) { + applied_hash_ace_entry_t *next_pae = vec_elt_at_index((*applied_hash_aces), pae->next_applied_entry_index); + ASSERT(next_pae->prev_applied_entry_index == old_index); + next_pae->prev_applied_entry_index = new_index; + } else { + /* + * Moving the very last entry, so we need to update the tail pointer in the first one. + */ + u32 head_index = find_head_applied_ace_index(applied_hash_aces, old_index); + ASSERT(head_index != ~0); + applied_hash_ace_entry_t *head_pae = vec_elt_at_index((*applied_hash_aces), head_index); + + ASSERT(head_pae->tail_applied_entry_index == old_index); + head_pae->tail_applied_entry_index = new_index; + } + /* invalidate the old entry */ + pae->prev_applied_entry_index = ~0; + pae->next_applied_entry_index = ~0; + pae->tail_applied_entry_index = ~0; +} + +static void +deactivate_applied_ace_hash_entry(acl_main_t *am, + u32 sw_if_index, u8 is_input, + applied_hash_ace_entry_t **applied_hash_aces, + u32 old_index) +{ + applied_hash_ace_entry_t *pae = vec_elt_at_index((*applied_hash_aces), old_index); + DBG("UNAPPLY DEACTIVATE: sw_if_index %d is_input %d, applied index %d", sw_if_index, is_input, old_index); + + if (pae->prev_applied_entry_index != ~0) { + DBG("UNAPPLY = index %d has prev_applied_entry_index %d", old_index, pae->prev_applied_entry_index); + applied_hash_ace_entry_t *prev_pae = vec_elt_at_index((*applied_hash_aces), pae->prev_applied_entry_index); + ASSERT(prev_pae->next_applied_entry_index == old_index); + prev_pae->next_applied_entry_index = pae->next_applied_entry_index; + if (pae->next_applied_entry_index == ~0) { + /* it was a last entry we removed, update the pointer on the first one */ + u32 head_index = find_head_applied_ace_index(applied_hash_aces, old_index); + DBG("UNAPPLY = index %d head index to update %d", old_index, head_index); + ASSERT(head_index != ~0); + applied_hash_ace_entry_t *head_pae = vec_elt_at_index((*applied_hash_aces), head_index); + + ASSERT(head_pae->tail_applied_entry_index == old_index); + head_pae->tail_applied_entry_index = pae->prev_applied_entry_index; + } else { + applied_hash_ace_entry_t *next_pae = vec_elt_at_index((*applied_hash_aces), 
pae->next_applied_entry_index); + next_pae->prev_applied_entry_index = pae->prev_applied_entry_index; + } + } else { + /* It was the first entry. We need either to reset the hash entry or delete it */ + if (pae->next_applied_entry_index != ~0) { + /* the next element becomes the new first one, so needs the tail pointer to be set */ + applied_hash_ace_entry_t *next_pae = vec_elt_at_index((*applied_hash_aces), pae->next_applied_entry_index); + ASSERT(pae->tail_applied_entry_index != ~0); + next_pae->tail_applied_entry_index = pae->tail_applied_entry_index; + DBG("Resetting the hash table entry from %d to %d, setting tail index to %d", old_index, pae->next_applied_entry_index, pae->tail_applied_entry_index); + /* unlink from the next element */ + next_pae->prev_applied_entry_index = ~0; + add_del_hashtable_entry(am, sw_if_index, is_input, + applied_hash_aces, pae->next_applied_entry_index, 1); + } else { + /* no next entry, so just delete the entry in the hash table */ + add_del_hashtable_entry(am, sw_if_index, is_input, + applied_hash_aces, old_index, 0); + } + } + /* invalidate the old entry */ + pae->prev_applied_entry_index = ~0; + pae->next_applied_entry_index = ~0; + pae->tail_applied_entry_index = ~0; +} + + +static void +hash_acl_build_applied_lookup_bitmap(acl_main_t *am, u32 sw_if_index, u8 is_input) +{ + int i; + uword *new_lookup_bitmap = 0; + applied_hash_acl_info_t **applied_hash_acls = is_input ? &am->input_applied_hash_acl_info_by_sw_if_index + : &am->output_applied_hash_acl_info_by_sw_if_index; + applied_hash_acl_info_t *pal = vec_elt_at_index((*applied_hash_acls), sw_if_index); + for(i=0; i < vec_len(pal->applied_acls); i++) { + u32 a_acl_index = *vec_elt_at_index((pal->applied_acls), i); + hash_acl_info_t *ha = vec_elt_at_index(am->hash_acl_infos, a_acl_index); + DBG("Update bitmask = %U or %U (acl_index %d)\n", format_bitmap_hex, new_lookup_bitmap, + format_bitmap_hex, ha->mask_type_index_bitmap, a_acl_index); + new_lookup_bitmap = clib_bitmap_or(new_lookup_bitmap, + ha->mask_type_index_bitmap); + } + uword *old_lookup_bitmap = pal->mask_type_index_bitmap; + pal->mask_type_index_bitmap = new_lookup_bitmap; + clib_bitmap_free(old_lookup_bitmap); +} + +void +hash_acl_unapply(acl_main_t *am, u32 sw_if_index, u8 is_input, int acl_index) +{ + int i; + + DBG0("HASH ACL unapply: sw_if_index %d is_input %d acl %d", sw_if_index, is_input, acl_index); + applied_hash_acl_info_t **applied_hash_acls = is_input ? &am->input_applied_hash_acl_info_by_sw_if_index + : &am->output_applied_hash_acl_info_by_sw_if_index; + applied_hash_acl_info_t *pal = vec_elt_at_index((*applied_hash_acls), sw_if_index); + + hash_acl_info_t *ha = vec_elt_at_index(am->hash_acl_infos, acl_index); + u32 **hash_acl_applied_sw_if_index = is_input ? 
&ha->inbound_sw_if_index_list + : &ha->outbound_sw_if_index_list; + + /* remove this acl# from the list of applied hash acls */ + u32 index = vec_search(pal->applied_acls, acl_index); + if (index == ~0) { + clib_warning("BUG: trying to unapply unapplied acl_index %d on sw_if_index %d is_input %d", + acl_index, sw_if_index, is_input); + return; + } + vec_del1(pal->applied_acls, index); + + u32 index2 = vec_search((*hash_acl_applied_sw_if_index), sw_if_index); + if (index2 == ~0) { + clib_warning("BUG: trying to unapply twice acl_index %d on (sw_if_index %d) is_input %d", + acl_index, sw_if_index, is_input); + return; + } + vec_del1((*hash_acl_applied_sw_if_index), index2); + + applied_hash_ace_entry_t **applied_hash_aces = get_applied_hash_aces(am, is_input, sw_if_index); + + for(i=0; i < vec_len((*applied_hash_aces)); i++) { + if (vec_elt_at_index(*applied_hash_aces,i)->acl_index == acl_index) { + DBG("Found applied ACL#%d at applied index %d", acl_index, i); + break; + } + } + if (vec_len((*applied_hash_aces)) <= i) { + DBG("Did not find applied ACL#%d at sw_if_index %d", acl_index, sw_if_index); + /* we went all the way without finding any entries. Probably a list was empty. */ + return; + } + + void *oldheap = hash_acl_set_heap(am); + int base_offset = i; + int tail_offset = base_offset + vec_len(ha->rules); + int tail_len = vec_len((*applied_hash_aces)) - tail_offset; + DBG("base_offset: %d, tail_offset: %d, tail_len: %d", base_offset, tail_offset, tail_len); + + for(i=0; i < vec_len(ha->rules); i ++) { + deactivate_applied_ace_hash_entry(am, sw_if_index, is_input, + applied_hash_aces, base_offset + i); + } + for(i=0; i < tail_len; i ++) { + /* move the entry at tail offset to base offset */ + /* that is, from (tail_offset+i) -> (base_offset+i) */ + DBG("UNAPPLY MOVE: sw_if_index %d is_input %d, applied index %d ->", sw_if_index, is_input, tail_offset+i, base_offset + i); + move_applied_ace_hash_entry(am, sw_if_index, is_input, applied_hash_aces, tail_offset + i, base_offset + i); + } + /* trim the end of the vector */ + _vec_len((*applied_hash_aces)) -= vec_len(ha->rules); + + applied_hash_entries_analyze(am, applied_hash_aces); + + /* After deletion we might not need some of the mask-types anymore... */ + hash_acl_build_applied_lookup_bitmap(am, sw_if_index, is_input); + clib_mem_set_heap (oldheap); +} + +/* + * Create the applied ACEs and update the hash table, + * taking into account that the ACL may not be the last + * in the vector of applied ACLs. + * + * For now, walk from the end of the vector and unapply the ACLs, + * then apply the one in question and reapply the rest. + */ + +void +hash_acl_reapply(acl_main_t *am, u32 sw_if_index, u8 is_input, int acl_index) +{ + u32 **applied_acls = is_input ? vec_elt_at_index(am->input_acl_vec_by_sw_if_index, sw_if_index) + : vec_elt_at_index(am->output_acl_vec_by_sw_if_index, sw_if_index); + int i; + int start_index = vec_search((*applied_acls), acl_index); + /* + * This function is called after we find out the sw_if_index where ACL is applied. + * If the by-sw_if_index vector does not have the ACL#, then it's a bug. 
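+ * Example: with the applied vector [A, B, C, D] and B being the ACL in
+ * question (B itself is assumed to be already unapplied, e.g. by
+ * hash_acl_delete() during an acl_add_replace()), the loops below
+ * unapply D, then C, and then apply B, C and D again in that order;
+ * A keeps its position and is not touched.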
+ */ + ASSERT(start_index < vec_len(*applied_acls)); + + /* unapply all the ACLs till the current one */ + for(i = vec_len(*applied_acls) - 1; i > start_index; i--) { + hash_acl_unapply(am, sw_if_index, is_input, *vec_elt_at_index(*applied_acls, i)); + } + for(i = start_index; i < vec_len(*applied_acls); i++) { + hash_acl_apply(am, sw_if_index, is_input, *vec_elt_at_index(*applied_acls, i)); + } +} + +static void +make_address_mask(ip46_address_t *addr, u8 is_ipv6, u8 prefix_len) +{ + if (is_ipv6) { + ip6_address_mask_from_width(&addr->ip6, prefix_len); + } else { + /* FIXME: this may not be correct way */ + ip6_address_mask_from_width(&addr->ip6, prefix_len + 3*32); + ip46_address_mask_ip4(addr); + } +} + +static u8 +make_port_mask(u16 *portmask, u16 port_first, u16 port_last) +{ + if (port_first == port_last) { + *portmask = 0xffff; + /* single port is representable by masked value */ + return 0; + } + if ((port_first == 0) && (port_last == 65535)) { + *portmask = 0; + /* wildcard port is representable by a masked value */ + return 0; + } + + /* + * For now match all the ports, later + * here might be a better optimization which would + * pick out bitmaskable portranges. + * + * However, adding a new mask type potentially + * adds a per-packet extra lookup, so the benefit is not clear. + */ + *portmask = 0; + /* This port range can't be represented via bitmask exactly. */ + return 1; +} + +static void +make_mask_and_match_from_rule(fa_5tuple_t *mask, acl_rule_t *r, hash_ace_info_t *hi, int match_nonfirst_fragment) +{ + memset(mask, 0, sizeof(*mask)); + memset(&hi->match, 0, sizeof(hi->match)); + hi->action = r->is_permit; + + /* we will need to be matching based on sw_if_index, direction, and mask_type_index when applied */ + mask->pkt.sw_if_index = ~0; + mask->pkt.is_input = 1; + /* we will assign the match of mask_type_index later when we find it*/ + mask->pkt.mask_type_index_lsb = ~0; + + mask->pkt.is_ip6 = 1; + hi->match.pkt.is_ip6 = r->is_ipv6; + + make_address_mask(&mask->addr[0], r->is_ipv6, r->src_prefixlen); + hi->match.addr[0] = r->src; + make_address_mask(&mask->addr[1], r->is_ipv6, r->dst_prefixlen); + hi->match.addr[1] = r->dst; + + if (r->proto != 0) { + mask->l4.proto = ~0; /* L4 proto needs to be matched */ + hi->match.l4.proto = r->proto; + if (match_nonfirst_fragment) { + /* match the non-first fragments only */ + mask->pkt.is_nonfirst_fragment = 1; + hi->match.pkt.is_nonfirst_fragment = 1; + } else { + /* Calculate the src/dst port masks and make the src/dst port matches accordingly */ + hi->src_portrange_not_powerof2 = make_port_mask(&mask->l4.port[0], r->src_port_or_type_first, r->src_port_or_type_last); + hi->match.l4.port[0] = r->src_port_or_type_first & mask->l4.port[0]; + hi->dst_portrange_not_powerof2 = make_port_mask(&mask->l4.port[1], r->dst_port_or_code_first, r->dst_port_or_code_last); + hi->match.l4.port[1] = r->dst_port_or_code_first & mask->l4.port[1]; + /* L4 info must be valid in order to match */ + mask->pkt.l4_valid = 1; + hi->match.pkt.l4_valid = 1; + /* And we must set the mask to check that it is an initial fragment */ + mask->pkt.is_nonfirst_fragment = 1; + hi->match.pkt.is_nonfirst_fragment = 0; + if ((r->proto == IPPROTO_TCP) && (r->tcp_flags_mask != 0)) { + /* if we want to match on TCP flags, they must be masked off as well */ + mask->pkt.tcp_flags = r->tcp_flags_mask; + hi->match.pkt.tcp_flags = r->tcp_flags_value; + /* and the flags need to be present within the packet being matched */ + mask->pkt.tcp_flags_valid = 1; + 
hi->match.pkt.tcp_flags_valid = 1; + } + } + } + /* Sanitize the mask and the match */ + u64 *pmask = (u64 *)mask; + u64 *pmatch = (u64 *)&hi->match; + int j; + for(j=0; j<6; j++) { + pmatch[j] = pmatch[j] & pmask[j]; + } +} + +static u32 +find_mask_type_index(acl_main_t *am, fa_5tuple_t *mask) +{ + ace_mask_type_entry_t *mte; + /* *INDENT-OFF* */ + pool_foreach(mte, am->ace_mask_type_pool, + ({ + if(memcmp(&mte->mask, mask, sizeof(*mask)) == 0) + return (mte - am->ace_mask_type_pool); + })); + /* *INDENT-ON* */ + return ~0; +} + +static u32 +assign_mask_type_index(acl_main_t *am, fa_5tuple_t *mask) +{ + u32 mask_type_index = find_mask_type_index(am, mask); + ace_mask_type_entry_t *mte; + if(~0 == mask_type_index) { + pool_get_aligned (am->ace_mask_type_pool, mte, CLIB_CACHE_LINE_BYTES); + mask_type_index = mte - am->ace_mask_type_pool; + clib_memcpy(&mte->mask, mask, sizeof(mte->mask)); + mte->refcount = 0; + /* + * We can use only 16 bits, since in the match there is only u16 field. + * Realistically, once you go to 64K of mask types, it is a huge + * problem anyway, so we might as well stop half way. + */ + ASSERT(mask_type_index < 32768); + } + mte = am->ace_mask_type_pool + mask_type_index; + mte->refcount++; + return mask_type_index; +} + +static void +release_mask_type_index(acl_main_t *am, u32 mask_type_index) +{ + ace_mask_type_entry_t *mte = pool_elt_at_index(am->ace_mask_type_pool, mask_type_index); + mte->refcount--; + if (mte->refcount == 0) { + /* we are not using this entry anymore */ + pool_put(am->ace_mask_type_pool, mte); + } +} + +void hash_acl_add(acl_main_t *am, int acl_index) +{ + void *oldheap = hash_acl_set_heap(am); + DBG("HASH ACL add : %d", acl_index); + int i; + acl_list_t *a = &am->acls[acl_index]; + vec_validate(am->hash_acl_infos, acl_index); + hash_acl_info_t *ha = vec_elt_at_index(am->hash_acl_infos, acl_index); + memset(ha, 0, sizeof(*ha)); + + /* walk the newly added ACL entries and ensure that for each of them there + is a mask type, increment a reference count for that mask type */ + for(i=0; i < a->count; i++) { + hash_ace_info_t ace_info; + fa_5tuple_t mask; + memset(&ace_info, 0, sizeof(ace_info)); + ace_info.acl_index = acl_index; + ace_info.ace_index = i; + + make_mask_and_match_from_rule(&mask, &a->rules[i], &ace_info, 0); + ace_info.mask_type_index = assign_mask_type_index(am, &mask); + /* assign the mask type index for matching itself */ + ace_info.match.pkt.mask_type_index_lsb = ace_info.mask_type_index; + DBG("ACE: %d mask_type_index: %d", i, ace_info.mask_type_index); + /* Ensure a given index is set in the mask type index bitmap for this ACL */ + ha->mask_type_index_bitmap = clib_bitmap_set(ha->mask_type_index_bitmap, ace_info.mask_type_index, 1); + vec_add1(ha->rules, ace_info); + if (am->l4_match_nonfirst_fragment) { + /* add the second rule which matches the noninitial fragments with the respective mask */ + make_mask_and_match_from_rule(&mask, &a->rules[i], &ace_info, 1); + ace_info.mask_type_index = assign_mask_type_index(am, &mask); + ace_info.match.pkt.mask_type_index_lsb = ace_info.mask_type_index; + DBG("ACE: %d (non-initial frags) mask_type_index: %d", i, ace_info.mask_type_index); + /* Ensure a given index is set in the mask type index bitmap for this ACL */ + ha->mask_type_index_bitmap = clib_bitmap_set(ha->mask_type_index_bitmap, ace_info.mask_type_index, 1); + vec_add1(ha->rules, ace_info); + } + } + /* + * if an ACL is applied somewhere, fill the corresponding lookup data structures. 
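+ * (the typical case: an acl_add_replace() of an ACL number that is
+ * already attached to one or more interfaces)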
+ * We need to take care if the ACL is not the last one in the vector of ACLs applied to the interface.
+ */
+  if (acl_index < vec_len(am->input_sw_if_index_vec_by_acl)) {
+    u32 *sw_if_index;
+    vec_foreach(sw_if_index, am->input_sw_if_index_vec_by_acl[acl_index]) {
+      hash_acl_reapply(am, *sw_if_index, 1, acl_index);
+    }
+  }
+  if (acl_index < vec_len(am->output_sw_if_index_vec_by_acl)) {
+    u32 *sw_if_index;
+    vec_foreach(sw_if_index, am->output_sw_if_index_vec_by_acl[acl_index]) {
+      hash_acl_reapply(am, *sw_if_index, 0, acl_index);
+    }
+  }
+  clib_mem_set_heap (oldheap);
+}
+
+void hash_acl_delete(acl_main_t *am, int acl_index)
+{
+  void *oldheap = hash_acl_set_heap(am);
+  DBG0("HASH ACL delete : %d", acl_index);
+  /*
+   * If the ACL is applied somewhere, remove the references to it (call hash_acl_unapply);
+   * this is a different behavior from the linear lookup, where an empty ACL is "deny all".
+   *
+   * However, following the vpp-dev discussion, an ACL that is referenced elsewhere
+   * should not be possible to delete, and the change adding this code also adds
+   * the safeguards to that effect, so this is not a problem.
+   *
+   * The part to remember is that this routine is called in the process of reapplication
+   * during the acl_add_replace() API call - the old acl ruleset is deleted, then
+   * the new one is added, without any change in the applied ACLs - so this case
+   * has to be handled.
+   */
+  hash_acl_info_t *ha = vec_elt_at_index(am->hash_acl_infos, acl_index);
+  u32 *interface_list_copy = 0;
+  {
+    u32 *sw_if_index;
+    interface_list_copy = vec_dup(ha->inbound_sw_if_index_list);
+    vec_foreach(sw_if_index, interface_list_copy) {
+      hash_acl_unapply(am, *sw_if_index, 1, acl_index);
+    }
+    vec_free(interface_list_copy);
+    interface_list_copy = vec_dup(ha->outbound_sw_if_index_list);
+    vec_foreach(sw_if_index, interface_list_copy) {
+      hash_acl_unapply(am, *sw_if_index, 0, acl_index);
+    }
+  }
+
+  /* walk the mask types for the ACL about to be deleted, and decrease
+   * the reference count, possibly freeing up some of them */
+  int i;
+  for(i=0; i < vec_len(ha->rules); i++) {
+    release_mask_type_index(am, ha->rules[i].mask_type_index);
+  }
+  clib_bitmap_free(ha->mask_type_index_bitmap);
+  vec_free(ha->rules);
+  clib_mem_set_heap (oldheap);
+}
+
+u8
+hash_multi_acl_match_5tuple (u32 sw_if_index, fa_5tuple_t * pkt_5tuple, int is_l2,
+                       int is_ip6, int is_input, u32 * acl_match_p,
+                       u32 * rule_match_p, u32 * trace_bitmap)
+{
+  acl_main_t *am = &acl_main;
+  applied_hash_ace_entry_t **applied_hash_aces = get_applied_hash_aces(am, is_input, sw_if_index);
+  u32 match_index = multi_acl_match_get_applied_ace_index(am, pkt_5tuple);
+  if (match_index < vec_len((*applied_hash_aces))) {
+    applied_hash_ace_entry_t *pae = vec_elt_at_index((*applied_hash_aces), match_index);
+    pae->hitcount++;
+    *acl_match_p = pae->acl_index;
+    *rule_match_p = pae->ace_index;
+    return pae->action;
+  }
+  return 0;
+}
+
+
+void
+show_hash_acl_hash (vlib_main_t * vm, acl_main_t *am, u32 verbose)
+{
+  vlib_cli_output(vm, "\nACL lookup hash table:\n%U\n",
+                  BV (format_bihash), &am->acl_lookup_hash, verbose);
+}
diff --git a/src/plugins/acl/hash_lookup.h b/src/plugins/acl/hash_lookup.h
new file mode 100644
index 00000000..2d7058e8
--- /dev/null
+++ b/src/plugins/acl/hash_lookup.h
@@ -0,0 +1,64 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#ifndef _ACL_HASH_LOOKUP_H_ +#define _ACL_HASH_LOOKUP_H_ + +#include <stddef.h> +#include "acl.h" + +/* + * Do the necessary to logically apply the ACL to the existing vector of ACLs looked up + * during the packet processing + */ + +void hash_acl_apply(acl_main_t *am, u32 sw_if_index, u8 is_input, int acl_index); + +/* Remove the ACL from the packet processing lookups on a given interface */ + +void hash_acl_unapply(acl_main_t *am, u32 sw_if_index, u8 is_input, int acl_index); + +/* + * Add an ACL or delete an ACL. ACL may already have been referenced elsewhere, + * so potentially we also need to do the work to enable the lookups. + */ + +void hash_acl_add(acl_main_t *am, int acl_index); +void hash_acl_delete(acl_main_t *am, int acl_index); + +/* + * Do the work required to match a given 5-tuple from the packet, + * and return the action as well as populate the values pointed + * to by the *_match_p pointers and maybe trace_bitmap. + */ + +u8 +hash_multi_acl_match_5tuple (u32 sw_if_index, fa_5tuple_t * pkt_5tuple, int is_l2, + int is_ip6, int is_input, u32 * acl_match_p, + u32 * rule_match_p, u32 * trace_bitmap); + + +/* + * The debug function to show the contents of the ACL lookup hash + */ +void show_hash_acl_hash(vlib_main_t * vm, acl_main_t *am, u32 verbose); + +/* Debug functions to turn validate/trace on and off */ +void acl_plugin_hash_acl_set_validate_heap(acl_main_t *am, int on); +void acl_plugin_hash_acl_set_trace_heap(acl_main_t *am, int on); + +#endif diff --git a/src/plugins/acl/hash_lookup_private.h b/src/plugins/acl/hash_lookup_private.h new file mode 100644 index 00000000..bc621416 --- /dev/null +++ b/src/plugins/acl/hash_lookup_private.h @@ -0,0 +1,33 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#define ACL_HASH_LOOKUP_DEBUG 0 + +#if ACL_HASH_LOOKUP_DEBUG == 1 +#define DBG0(...) clib_warning(__VA_ARGS__) +#define DBG(...) +#define DBG_UNIX_LOG(...) +#elif ACL_HASH_LOOKUP_DEBUG == 2 +#define DBG0(...) clib_warning(__VA_ARGS__) +#define DBG(...) clib_warning(__VA_ARGS__) +#define DBG_UNIX_LOG(...) clib_unix_warning(__VA_ARGS__) +#else +#define DBG0(...) +#define DBG(...) +#define DBG_UNIX_LOG(...) 
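+
+/*
+ * Usage sketch (illustrative only, not wired into the build): to get the
+ * verbose tracing from hash_lookup.c, change the setting at the top of
+ * this file before rebuilding, e.g.
+ *
+ *   #define ACL_HASH_LOOKUP_DEBUG 2
+ *
+ * which turns the DBG0(...)/DBG(...) macros into clib_warning() calls
+ * and DBG_UNIX_LOG(...) into clib_unix_warning().
+ */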
+#endif + diff --git a/src/plugins/acl/hash_lookup_types.h b/src/plugins/acl/hash_lookup_types.h new file mode 100644 index 00000000..1fa197ec --- /dev/null +++ b/src/plugins/acl/hash_lookup_types.h @@ -0,0 +1,107 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#ifndef _ACL_HASH_LOOKUP_TYPES_H_ +#define _ACL_HASH_LOOKUP_TYPES_H_ + +/* The structure representing the single entry with hash representation */ +typedef struct { + /* these two entries refer to the original ACL# and rule# within that ACL */ + u32 acl_index; + u32 ace_index; + + u32 mask_type_index; + u8 src_portrange_not_powerof2; + u8 dst_portrange_not_powerof2; + + fa_5tuple_t match; + u8 action; +} hash_ace_info_t; + +/* + * The structure holding the information necessary for the hash-based ACL operation + */ +typedef struct { + /* The mask types present in this ACL */ + uword *mask_type_index_bitmap; + /* hash ACL applied on these interfaces */ + u32 *inbound_sw_if_index_list; + u32 *outbound_sw_if_index_list; + hash_ace_info_t *rules; +} hash_acl_info_t; + +typedef struct { + /* original non-compiled ACL */ + u32 acl_index; + u32 ace_index; + /* the index of the hash_ace_info_t */ + u32 hash_ace_info_index; + /* + * in case of the same key having multiple entries, + * this holds the index of the next entry. + */ + u32 next_applied_entry_index; + /* + * previous entry in the list of the chained ones, + * if ~0 then this is entry in the hash. + */ + u32 prev_applied_entry_index; + /* + * chain tail, if this is the first entry + */ + u32 tail_applied_entry_index; + /* + * number of hits on this entry + */ + u64 hitcount; + /* + * Action of this applied ACE + */ + u8 action; +} applied_hash_ace_entry_t; + +typedef struct { + /* + * A logical OR of all the applied_ace_hash_entry_t=> + * hash_ace_info_t=>mask_type_index bits set + */ + uword *mask_type_index_bitmap; + /* applied ACLs so we can track them independently from main ACL module */ + u32 *applied_acls; +} applied_hash_acl_info_t; + + +typedef union { + u64 as_u64; + struct { + u32 applied_entry_index; + u16 reserved_u16; + u8 reserved_u8; + /* means there is some other entry in front intersecting with this one */ + u8 shadowed:1; + u8 need_portrange_check:1; + u8 reserved_flags:6; + }; +} hash_acl_lookup_value_t; + +#define CT_ASSERT_EQUAL(name, x,y) typedef int assert_ ## name ## _compile_time_assertion_failed[((x) == (y))-1] + +CT_ASSERT_EQUAL(hash_acl_lookup_value_t_is_u64, sizeof(hash_acl_lookup_value_t), sizeof(u64)); + +#undef CT_ASSERT_EQUAL + +#endif diff --git a/src/plugins/acl/manual_fns.h b/src/plugins/acl/manual_fns.h new file mode 100644 index 00000000..e00f1abc --- /dev/null +++ b/src/plugins/acl/manual_fns.h @@ -0,0 +1,408 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef included_manual_fns_h +#define included_manual_fns_h + +#include <vnet/ip/format.h> +#include <vnet/ethernet/ethernet.h> + +/* Macro to finish up custom dump fns */ +#define PRINT_S \ + vec_add1 (s, 0); \ + vl_print (handle, (char *)s); \ + vec_free (s); + +static inline void +vl_api_acl_rule_t_array_endian(vl_api_acl_rule_t *rules, u32 count) +{ + u32 i; + for(i=0; i<count; i++) { + vl_api_acl_rule_t_endian (&rules[i]); + } +} + +static inline void +vl_api_macip_acl_rule_t_array_endian(vl_api_macip_acl_rule_t *rules, u32 count) +{ + u32 i; + for(i=0; i<count; i++) { + vl_api_macip_acl_rule_t_endian (&rules[i]); + } +} + +static inline void +vl_api_acl_details_t_endian (vl_api_acl_details_t * a) +{ + a->_vl_msg_id = clib_net_to_host_u16 (a->_vl_msg_id); + a->context = clib_net_to_host_u32 (a->context); + a->acl_index = clib_net_to_host_u32 (a->acl_index); + /* a->tag[0..63] = a->tag[0..63] (no-op) */ + a->count = clib_net_to_host_u32 (a->count); + vl_api_acl_rule_t_array_endian (a->r, a->count); +} + +static inline void +vl_api_macip_acl_details_t_endian (vl_api_macip_acl_details_t * a) +{ + a->_vl_msg_id = clib_net_to_host_u16 (a->_vl_msg_id); + a->context = clib_net_to_host_u32 (a->context); + a->acl_index = clib_net_to_host_u32 (a->acl_index); + /* a->tag[0..63] = a->tag[0..63] (no-op) */ + a->count = clib_net_to_host_u32 (a->count); + vl_api_macip_acl_rule_t_array_endian (a->r, a->count); +} + + +static inline void +vl_api_acl_add_replace_t_endian (vl_api_acl_add_replace_t * a) +{ + a->_vl_msg_id = clib_net_to_host_u16 (a->_vl_msg_id); + a->client_index = clib_net_to_host_u32 (a->client_index); + a->context = clib_net_to_host_u32 (a->context); + a->acl_index = clib_net_to_host_u32 (a->acl_index); + /* a->tag[0..63] = a->tag[0..63] (no-op) */ + a->count = clib_net_to_host_u32 (a->count); + vl_api_acl_rule_t_array_endian (a->r, a->count); +} + +static inline void +vl_api_macip_acl_add_t_endian (vl_api_macip_acl_add_t * a) +{ + a->_vl_msg_id = clib_net_to_host_u16 (a->_vl_msg_id); + a->client_index = clib_net_to_host_u32 (a->client_index); + a->context = clib_net_to_host_u32 (a->context); + /* a->tag[0..63] = a->tag[0..63] (no-op) */ + a->count = clib_net_to_host_u32 (a->count); + vl_api_macip_acl_rule_t_array_endian (a->r, a->count); +} + +static inline void +vl_api_macip_acl_add_replace_t_endian (vl_api_macip_acl_add_replace_t * a) +{ + a->_vl_msg_id = clib_net_to_host_u16 (a->_vl_msg_id); + a->client_index = clib_net_to_host_u32 (a->client_index); + a->context = clib_net_to_host_u32 (a->context); + a->acl_index = clib_net_to_host_u32 (a->acl_index); + /* a->tag[0..63] = a->tag[0..63] (no-op) */ + a->count = clib_net_to_host_u32 (a->count); + vl_api_macip_acl_rule_t_array_endian (a->r, a->count); +} + +static inline u8 * +format_acl_action(u8 *s, u8 action) +{ + switch(action) { + case 0: + s = format (s, "deny"); + break; + case 1: + s = format (s, "permit"); + break; + case 2: + s = format (s, "permit+reflect"); + 
break; + default: + s = format (s, "action %d", action); + } + return(s); +} + +static inline void * +vl_api_acl_rule_t_print (vl_api_acl_rule_t * a, void *handle) +{ + u8 *s; + + s = format (0, " %s ", a->is_ipv6 ? "ipv6" : "ipv4"); + s = format_acl_action (s, a->is_permit); + s = format (s, " \\\n"); + + if (a->is_ipv6) + s = format (s, " src %U/%d dst %U/%d \\\n", + format_ip6_address, a->src_ip_addr, a->src_ip_prefix_len, + format_ip6_address, a->dst_ip_addr, a->dst_ip_prefix_len); + else + s = format (s, " src %U/%d dst %U/%d \\\n", + format_ip4_address, a->src_ip_addr, a->src_ip_prefix_len, + format_ip4_address, a->dst_ip_addr, a->dst_ip_prefix_len); + s = format (s, " proto %d \\\n", a->proto); + s = format (s, " sport %d-%d dport %d-%d \\\n", + clib_net_to_host_u16 (a->srcport_or_icmptype_first), + clib_net_to_host_u16 (a->srcport_or_icmptype_last), + clib_net_to_host_u16 (a->dstport_or_icmpcode_first), + clib_net_to_host_u16 (a->dstport_or_icmpcode_last)); + + s = format (s, " tcpflags %u mask %u, \\", + a->tcp_flags_value, a->tcp_flags_mask); + PRINT_S; + return handle; +} + + + +static inline void * +vl_api_macip_acl_rule_t_print (vl_api_macip_acl_rule_t * a, void *handle) +{ + u8 *s; + + s = format (0, " %s %s \\\n", a->is_ipv6 ? "ipv6" : "ipv4", + a->is_permit ? "permit" : "deny"); + + s = format (s, " src mac %U mask %U \\\n", + format_ethernet_address, a->src_mac, + format_ethernet_address, a->src_mac_mask); + + if (a->is_ipv6) + s = format (s, " src ip %U/%d, \\", + format_ip6_address, a->src_ip_addr, a->src_ip_prefix_len); + else + s = format (s, " src ip %U/%d, \\", + format_ip4_address, a->src_ip_addr, a->src_ip_prefix_len); + + PRINT_S; + return handle; +} + +static inline void * +vl_api_acl_add_replace_t_print (vl_api_acl_add_replace_t * a, void *handle) +{ + u8 *s = 0; + int i; + u32 acl_index = clib_net_to_host_u32 (a->acl_index); + u32 count = clib_net_to_host_u32 (a->count); + if (count > 0x100000) + { + s = format (s, "WARN: acl_add_replace count endianness wrong? Fixup to avoid long loop.\n"); + count = a->count; + } + + s = format (s, "SCRIPT: acl_add_replace %d count %d ", + acl_index, count); + + if (a->tag[0]) + s = format (s, "tag %s ", a->tag); + + s = format(s, "\\\n"); + PRINT_S; + + for (i = 0; i < count; i++) + vl_api_acl_rule_t_print (&a->r[i], handle); + + s = format(s, "\n"); + PRINT_S; + return handle; +} + +static inline void * +vl_api_acl_del_t_print (vl_api_macip_acl_del_t * a, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: acl_del %d ", + clib_host_to_net_u32 (a->acl_index)); + + PRINT_S; + return handle; +} + + +static inline void * +vl_api_acl_details_t_print (vl_api_acl_details_t * a, void *handle) +{ + u8 *s = 0; + int i; + u32 acl_index = clib_net_to_host_u32 (a->acl_index); + u32 count = clib_net_to_host_u32 (a->count); + if (count > 0x100000) + { + s = format (s, "WARN: acl_defails count endianness wrong? 
Fixup to avoid long loop.\n"); + count = a->count; + } + + s = format (s, "acl_details index %d count %d ", + acl_index, count); + + if (a->tag[0]) + s = format (s, "tag %s ", a->tag); + + s = format(s, "\n"); + PRINT_S; + + for (i = 0; i < count; i++) + vl_api_acl_rule_t_print (&a->r[i], handle); + + return handle; +} + +static inline void * +vl_api_macip_acl_details_t_print (vl_api_macip_acl_details_t * a, + void *handle) +{ + u8 *s = 0; + int i; + u32 acl_index = clib_net_to_host_u32 (a->acl_index); + u32 count = clib_net_to_host_u32 (a->count); + if (count > 0x100000) + { + s = format (s, "WARN: macip_acl_defails count endianness wrong? Fixup to avoid long loop.\n"); + count = a->count; + } + + s = format (s, "macip_acl_details index %d count %d ", + acl_index, count); + + if (a->tag[0]) + s = format (s, "tag %s ", a->tag); + + s = format(s, "\n"); + PRINT_S; + + for (i = 0; i < count; i++) + vl_api_macip_acl_rule_t_print (&a->r[i], handle); + + return handle; +} + +static inline void * +vl_api_macip_acl_add_t_print (vl_api_macip_acl_add_t * a, void *handle) +{ + u8 *s = 0; + int i; + u32 count = clib_net_to_host_u32 (a->count); + if (count > 0x100000) + { + s = format (s, "WARN: macip_acl_add count endianness wrong? Fixup to avoid long loop.\n"); + count = a->count; + } + + s = format (s, "SCRIPT: macip_acl_add "); + if (a->tag[0]) + s = format (s, "tag %s ", a->tag); + + s = format (s, "count %d \\\n", count); + + PRINT_S; + + for (i = 0; i < count; i++) + vl_api_macip_acl_rule_t_print (&a->r[i], handle); + + s = format (0, "\n"); + PRINT_S; + + return handle; +} + +static inline void * +vl_api_macip_acl_add_replace_t_print (vl_api_macip_acl_add_replace_t * a, void *handle) +{ + u8 *s = 0; + int i; + u32 acl_index = clib_net_to_host_u32 (a->acl_index); + u32 count = clib_net_to_host_u32 (a->count); + if (count > 0x100000) + { + s = format (s, "WARN: macip_acl_add_replace count endianness wrong? Fixup to avoid long loop.\n"); + count = a->count; + } + + s = format (s, "SCRIPT: macip_acl_add_replace %d count %d ", + acl_index, count); + if (a->tag[0]) + s = format (s, "tag %s ", a->tag); + + s = format (s, "count %d \\\n", count); + + PRINT_S; + + for (i = 0; i < count; i++) + vl_api_macip_acl_rule_t_print (&a->r[i], handle); + + s = format (0, "\n"); + PRINT_S; + + return handle; +} + +static inline void * +vl_api_acl_interface_set_acl_list_t_print (vl_api_acl_interface_set_acl_list_t + * a, void *handle) +{ + u8 *s; + int i; + + s = format + (0, "SCRIPT: acl_interface_set_acl_list sw_if_index %d count %d\n", + clib_net_to_host_u32 (a->sw_if_index), (u32) a->count); + + s = format (s, " input "); + + for (i = 0; i < a->count; i++) + { + if (i == a->n_input) + s = format (s, "output "); + s = format (s, "%d ", clib_net_to_host_u32 (a->acls[i])); + } + + PRINT_S; + return handle; +} + +static inline void * +vl_api_acl_interface_add_del_t_print (vl_api_acl_interface_add_del_t * a, + void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: acl_interface_add_del sw_if_index %d acl %d ", + clib_net_to_host_u32 (a->sw_if_index), + clib_net_to_host_u32 (a->acl_index)); + s = format (s, "%s %s", + a->is_input ? "input" : "output", a->is_add ? 
"add" : "del"); + + PRINT_S; + return handle; +} + +static inline void *vl_api_macip_acl_interface_add_del_t_print + (vl_api_macip_acl_interface_add_del_t * a, void *handle) +{ + u8 *s; + + s = format + (0, + "SCRIPT: macip_acl_interface_add_del sw_if_index %d acl_index %d ", + clib_net_to_host_u32 (a->sw_if_index), + clib_net_to_host_u32 (a->acl_index)); + s = format (s, "%s", a->is_add ? "add" : "del"); + + PRINT_S; + return handle; +} + + +static inline void * +vl_api_macip_acl_del_t_print (vl_api_macip_acl_del_t * a, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: macip_acl_del %d ", + clib_host_to_net_u32 (a->acl_index)); + + PRINT_S; + return handle; +} + + +#endif /* included_manual_fns_h */ diff --git a/src/plugins/dpdk.am b/src/plugins/dpdk.am new file mode 100644 index 00000000..15195a21 --- /dev/null +++ b/src/plugins/dpdk.am @@ -0,0 +1,73 @@ +# Copyright (c) 2016 Cisco Systems, Inc. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +vppapitestplugins_LTLIBRARIES += dpdk_test_plugin.la +vppplugins_LTLIBRARIES += dpdk_plugin.la + +if ENABLE_DPDK_SHARED +dpdk_plugin_la_LDFLAGS = $(AM_LDFLAGS) -ldpdk +else +dpdk_plugin_la_LDFLAGS = $(AM_LDFLAGS) -Wl,--whole-archive,-l:libdpdk.a,--no-whole-archive +endif +if WITH_AESNI_MB_LIB +dpdk_plugin_la_LDFLAGS += -Wl,--exclude-libs,libIPSec_MB.a,-l:libIPSec_MB.a +endif +if WITH_ISA_L_CRYPTO_LIB +dpdk_plugin_la_LDFLAGS += -Wl,--exclude-libs,libisal_crypto.a,-l:libisal_crypto.a +endif +if WITH_IBVERBS_LIB +dpdk_plugin_la_LDFLAGS += -Wl,-libverbs +endif +if DPDK_IS_1702_OR_1705 +dpdk_plugin_la_CFLAGS = $(AM_CFLAGS) -DDPDK_VOID_CALLBACK=1 -DDPDK_NO_AEAD=1 +else +dpdk_plugin_la_CFLAGS = $(AM_CFLAGS) -DDPDK_VOID_CALLBACK=0 -DDPDK_NO_AEAD=0 +dpdk_plugin_la_LDFLAGS += -Wl,-lnuma +endif + +dpdk_plugin_la_LDFLAGS += -Wl,-lm,-ldl + +dpdk_plugin_la_SOURCES = \ + dpdk/main.c \ + dpdk/buffer.c \ + dpdk/thread.c \ + dpdk/api/dpdk_api.c \ + dpdk/device/cli.c \ + dpdk/device/common.c \ + dpdk/device/dpdk_priv.h \ + dpdk/device/device.c \ + dpdk/device/format.c \ + dpdk/device/init.c \ + dpdk/device/node.c \ + dpdk/hqos/hqos.c \ + dpdk/ipsec/esp_encrypt.c \ + dpdk/ipsec/esp_decrypt.c \ + dpdk/ipsec/crypto_node.c \ + dpdk/ipsec/cli.c \ + dpdk/ipsec/ipsec.c \ + dpdk/api/dpdk_plugin.api.h + +API_FILES += dpdk/api/dpdk.api + +nobase_include_HEADERS += \ + dpdk/device/dpdk.h \ + dpdk/api/dpdk_all_api_h.h + +nobase_include_HEADERS += \ + dpdk/ipsec/ipsec.h \ + dpdk/ipsec/esp.h + +dpdk_test_plugin_la_SOURCES = \ + dpdk/api/dpdk_test.c dpdk/api/dpdk_plugin.api.h + +# vi:syntax=automake diff --git a/src/plugins/dpdk/api/dpdk.api b/src/plugins/dpdk/api/dpdk.api new file mode 100644 index 00000000..d43f8a36 --- /dev/null +++ b/src/plugins/dpdk/api/dpdk.api @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2015-2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** \brief DPDK interface HQoS pipe profile set request + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param sw_if_index - the interface + @param subport - subport ID + @param pipe - pipe ID within its subport + @param profile - pipe profile ID +*/ +autoreply define sw_interface_set_dpdk_hqos_pipe { + u32 client_index; + u32 context; + u32 sw_if_index; + u32 subport; + u32 pipe; + u32 profile; +}; + +/** \brief DPDK interface HQoS subport parameters set request + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param sw_if_index - the interface + @param subport - subport ID + @param tb_rate - subport token bucket rate (measured in bytes/second) + @param tb_size - subport token bucket size (measured in credits) + @param tc_rate - subport traffic class 0 .. 3 rates (measured in bytes/second) + @param tc_period - enforcement period for rates (measured in milliseconds) +*/ +autoreply define sw_interface_set_dpdk_hqos_subport { + u32 client_index; + u32 context; + u32 sw_if_index; + u32 subport; + u32 tb_rate; + u32 tb_size; + u32 tc_rate[4]; + u32 tc_period; +}; + +/** \brief DPDK interface HQoS tctbl entry set request + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param sw_if_index - the interface + @param entry - entry index ID + @param tc - traffic class (0 .. 3) + @param queue - traffic class queue (0 .. 3) +*/ +autoreply define sw_interface_set_dpdk_hqos_tctbl { + u32 client_index; + u32 context; + u32 sw_if_index; + u32 entry; + u32 tc; + u32 queue; +}; + +/* + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ + diff --git a/src/plugins/dpdk/api/dpdk_all_api_h.h b/src/plugins/dpdk/api/dpdk_all_api_h.h new file mode 100644 index 00000000..15eb98d6 --- /dev/null +++ b/src/plugins/dpdk/api/dpdk_all_api_h.h @@ -0,0 +1,19 @@ + +/* + * dpdk_all_api_h.h - skeleton vpp engine plug-in api #include file + * + * Copyright (c) <current-year> <your-organization> + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* Include the generated file, see BUILT_SOURCES in Makefile.am */ +#include <dpdk/api/dpdk.api.h> diff --git a/src/plugins/dpdk/api/dpdk_api.c b/src/plugins/dpdk/api/dpdk_api.c new file mode 100755 index 00000000..97c4bc75 --- /dev/null +++ b/src/plugins/dpdk/api/dpdk_api.c @@ -0,0 +1,332 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <vnet/vnet.h> +#include <vppinfra/vec.h> +#include <vppinfra/error.h> +#include <vppinfra/format.h> +#include <vppinfra/bitmap.h> + +#include <vnet/ethernet/ethernet.h> +#include <dpdk/device/dpdk.h> +#include <vlib/pci/pci.h> + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/stat.h> +#include <sys/mount.h> +#include <string.h> +#include <fcntl.h> + +#include <dpdk/device/dpdk_priv.h> + +#include <vlibapi/api.h> +#include <vlibmemory/api.h> + +/* define message IDs */ +#include <dpdk/api/dpdk_msg_enum.h> + +#define vl_typedefs /* define message structures */ +#include <dpdk/api/dpdk_all_api_h.h> +#undef vl_typedefs + +#define vl_endianfun /* define message structures */ +#include <dpdk/api/dpdk_all_api_h.h> +#undef vl_endianfun + +#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__) + +/* Get the API version number. */ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <dpdk/api/dpdk_all_api_h.h> +#undef vl_api_version + +/* Macro to finish up custom dump fns */ +#define FINISH \ + vec_add1 (s, 0); \ + vl_print (handle, (char *)s); \ + vec_free (s); \ + return handle; + +#include <vlibapi/api_helper_macros.h> + +static void + vl_api_sw_interface_set_dpdk_hqos_pipe_t_handler + (vl_api_sw_interface_set_dpdk_hqos_pipe_t * mp) +{ + vl_api_sw_interface_set_dpdk_hqos_pipe_reply_t *rmp; + int rv = 0; + + dpdk_main_t *dm = &dpdk_main; + dpdk_device_t *xd; + + u32 sw_if_index = ntohl (mp->sw_if_index); + u32 subport = ntohl (mp->subport); + u32 pipe = ntohl (mp->pipe); + u32 profile = ntohl (mp->profile); + vnet_hw_interface_t *hw; + + VALIDATE_SW_IF_INDEX (mp); + + /* hw_if & dpdk device */ + hw = vnet_get_sup_hw_interface (dm->vnet_main, sw_if_index); + + xd = vec_elt_at_index (dm->devices, hw->dev_instance); + + rv = rte_sched_pipe_config (xd->hqos_ht->hqos, subport, pipe, profile); + + BAD_SW_IF_INDEX_LABEL; + + REPLY_MACRO (VL_API_SW_INTERFACE_SET_DPDK_HQOS_PIPE_REPLY); +} + +static void *vl_api_sw_interface_set_dpdk_hqos_pipe_t_print + (vl_api_sw_interface_set_dpdk_hqos_pipe_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: sw_interface_set_dpdk_hqos_pipe "); + + s = format (s, "sw_if_index %u ", ntohl (mp->sw_if_index)); + + s = format (s, "subport %u pipe %u profile %u ", + ntohl (mp->subport), ntohl (mp->pipe), ntohl (mp->profile)); + + FINISH; +} + +static void + vl_api_sw_interface_set_dpdk_hqos_subport_t_handler + (vl_api_sw_interface_set_dpdk_hqos_subport_t * mp) +{ + vl_api_sw_interface_set_dpdk_hqos_subport_reply_t *rmp; + int rv = 0; + + dpdk_main_t *dm = &dpdk_main; + dpdk_device_t *xd; + struct rte_sched_subport_params p; + + u32 sw_if_index = ntohl (mp->sw_if_index); + u32 subport = ntohl (mp->subport); + p.tb_rate = ntohl (mp->tb_rate); + p.tb_size = ntohl (mp->tb_size); + p.tc_rate[0] = ntohl (mp->tc_rate[0]); + p.tc_rate[1] = ntohl (mp->tc_rate[1]); + p.tc_rate[2] = ntohl (mp->tc_rate[2]); + p.tc_rate[3] = ntohl (mp->tc_rate[3]); + 
p.tc_period = ntohl (mp->tc_period); + + vnet_hw_interface_t *hw; + + VALIDATE_SW_IF_INDEX (mp); + + /* hw_if & dpdk device */ + hw = vnet_get_sup_hw_interface (dm->vnet_main, sw_if_index); + + xd = vec_elt_at_index (dm->devices, hw->dev_instance); + + rv = rte_sched_subport_config (xd->hqos_ht->hqos, subport, &p); + + BAD_SW_IF_INDEX_LABEL; + + REPLY_MACRO (VL_API_SW_INTERFACE_SET_DPDK_HQOS_SUBPORT_REPLY); +} + +static void *vl_api_sw_interface_set_dpdk_hqos_subport_t_print + (vl_api_sw_interface_set_dpdk_hqos_subport_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: sw_interface_set_dpdk_hqos_subport "); + + s = format (s, "sw_if_index %u ", ntohl (mp->sw_if_index)); + + s = + format (s, + "subport %u rate %u bkt_size %u tc0 %u tc1 %u tc2 %u tc3 %u period %u", + ntohl (mp->subport), ntohl (mp->tb_rate), ntohl (mp->tb_size), + ntohl (mp->tc_rate[0]), ntohl (mp->tc_rate[1]), + ntohl (mp->tc_rate[2]), ntohl (mp->tc_rate[3]), + ntohl (mp->tc_period)); + + FINISH; +} + +static void + vl_api_sw_interface_set_dpdk_hqos_tctbl_t_handler + (vl_api_sw_interface_set_dpdk_hqos_tctbl_t * mp) +{ + vl_api_sw_interface_set_dpdk_hqos_tctbl_reply_t *rmp; + int rv = 0; + + dpdk_main_t *dm = &dpdk_main; + vlib_thread_main_t *tm = vlib_get_thread_main (); + dpdk_device_t *xd; + + u32 sw_if_index = ntohl (mp->sw_if_index); + u32 entry = ntohl (mp->entry); + u32 tc = ntohl (mp->tc); + u32 queue = ntohl (mp->queue); + u32 val, i; + + vnet_hw_interface_t *hw; + + VALIDATE_SW_IF_INDEX (mp); + + /* hw_if & dpdk device */ + hw = vnet_get_sup_hw_interface (dm->vnet_main, sw_if_index); + + xd = vec_elt_at_index (dm->devices, hw->dev_instance); + + if (tc >= RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE) + { + clib_warning ("invalid traffic class !!"); + rv = VNET_API_ERROR_INVALID_VALUE; + goto done; + } + if (queue >= RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS) + { + clib_warning ("invalid queue !!"); + rv = VNET_API_ERROR_INVALID_VALUE; + goto done; + } + + /* Detect the set of worker threads */ + uword *p = hash_get_mem (tm->thread_registrations_by_name, "workers"); + + if (p == 0) + { + clib_warning ("worker thread registration AWOL !!"); + rv = VNET_API_ERROR_INVALID_VALUE_2; + goto done; + } + + vlib_thread_registration_t *tr = (vlib_thread_registration_t *) p[0]; + int worker_thread_first = tr->first_index; + int worker_thread_count = tr->count; + + val = tc * RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS + queue; + for (i = 0; i < worker_thread_count; i++) + xd->hqos_wt[worker_thread_first + i].hqos_tc_table[entry] = val; + + BAD_SW_IF_INDEX_LABEL; +done: + + REPLY_MACRO (VL_API_SW_INTERFACE_SET_DPDK_HQOS_TCTBL_REPLY); +} + +static void *vl_api_sw_interface_set_dpdk_hqos_tctbl_t_print + (vl_api_sw_interface_set_dpdk_hqos_tctbl_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: sw_interface_set_dpdk_hqos_tctbl "); + + s = format (s, "sw_if_index %u ", ntohl (mp->sw_if_index)); + + s = format (s, "entry %u tc %u queue %u", + ntohl (mp->entry), ntohl (mp->tc), ntohl (mp->queue)); + + FINISH; +} + +#define foreach_dpdk_plugin_api_msg \ +_(SW_INTERFACE_SET_DPDK_HQOS_PIPE, sw_interface_set_dpdk_hqos_pipe) \ +_(SW_INTERFACE_SET_DPDK_HQOS_SUBPORT, sw_interface_set_dpdk_hqos_subport) \ +_(SW_INTERFACE_SET_DPDK_HQOS_TCTBL, sw_interface_set_dpdk_hqos_tctbl) + +/* Set up the API message handling tables */ +static clib_error_t * +dpdk_plugin_api_hookup (vlib_main_t * vm) +{ + dpdk_main_t *dm __attribute__ ((unused)) = &dpdk_main; +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + dm->msg_id_base), \ + #n, \ + 
vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_dpdk_plugin_api_msg; +#undef _ + return 0; +} + +#define vl_msg_name_crc_list +#include <dpdk/api/dpdk_all_api_h.h> +#undef vl_msg_name_crc_list + +static void +setup_message_id_table (dpdk_main_t * dm, api_main_t * am) +{ +#define _(id,n,crc) \ + vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id + dm->msg_id_base); + foreach_vl_msg_name_crc_dpdk; +#undef _ +} + +// TODO +/* +static void plugin_custom_dump_configure (dpdk_main_t * dm) +{ +#define _(n,f) dm->api_main->msg_print_handlers \ + [VL_API_##n + dm->msg_id_base] \ + = (void *) vl_api_##f##_t_print; + foreach_dpdk_plugin_api_msg; +#undef _ +} +*/ +/* force linker to link functions used by vlib and declared weak */ + +static clib_error_t * +dpdk_api_init (vlib_main_t * vm) +{ + dpdk_main_t *dm = &dpdk_main; + clib_error_t *error = 0; + + /* init CLI */ + if ((error = vlib_call_init_function (vm, dpdk_init))) + return error; + + u8 *name; + name = format (0, "dpdk_%08x%c", api_version, 0); + + /* Ask for a correctly-sized block of API message decode slots */ + dm->msg_id_base = vl_msg_api_get_msg_ids + ((char *) name, VL_MSG_FIRST_AVAILABLE); + vec_free (name); + + error = dpdk_plugin_api_hookup (vm); + + /* Add our API messages to the global name_crc hash table */ + setup_message_id_table (dm, &api_main); + +// TODO +// plugin_custom_dump_configure (dm); + + return error; +} + +VLIB_INIT_FUNCTION (dpdk_api_init); + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/dpdk/api/dpdk_msg_enum.h b/src/plugins/dpdk/api/dpdk_msg_enum.h new file mode 100644 index 00000000..952ce6ad --- /dev/null +++ b/src/plugins/dpdk/api/dpdk_msg_enum.h @@ -0,0 +1,31 @@ + +/* + * dpdk_msg_enum.h - skeleton vpp engine plug-in message enumeration + * + * Copyright (c) <current-year> <your-organization> + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef included_dpdk_msg_enum_h +#define included_dpdk_msg_enum_h + +#include <vppinfra/byte_order.h> + +#define vl_msg_id(n,h) n, +typedef enum { +#include <dpdk/api/dpdk_all_api_h.h> + /* We'll want to know how many messages IDs we need... */ + VL_MSG_FIRST_AVAILABLE, +} vl_msg_id_t; +#undef vl_msg_id + +#endif /* included_dpdk_msg_enum_h */ diff --git a/src/plugins/dpdk/api/dpdk_test.c b/src/plugins/dpdk/api/dpdk_test.c new file mode 100644 index 00000000..ea17e5d0 --- /dev/null +++ b/src/plugins/dpdk/api/dpdk_test.c @@ -0,0 +1,397 @@ + +/* + * dpdk_test.c - skeleton vpp-api-test plug-in + * + * Copyright (c) <current-year> <your-organization> + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <vat/vat.h> +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vlibsocket/api.h> +#include <vppinfra/error.h> +#include <vnet/ip/ip.h> + +uword unformat_sw_if_index (unformat_input_t * input, va_list * args); + +/* Declare message IDs */ +#include <dpdk/api/dpdk_msg_enum.h> + +/* define message structures */ +#define vl_typedefs +#include <dpdk/api/dpdk.api.h> +#undef vl_typedefs + +/* declare message handlers for each api */ + +#define vl_endianfun /* define message structures */ +#include <dpdk/api/dpdk.api.h> +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) +#define vl_printfun +#include <dpdk/api/dpdk.api.h> +#undef vl_printfun + +/* Get the API version number. */ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <dpdk/api/dpdk.api.h> +#undef vl_api_version + +typedef struct { + /* API message ID base */ + u16 msg_id_base; + vat_main_t *vat_main; +} dpdk_test_main_t; + +dpdk_test_main_t dpdk_test_main; + +#define foreach_standard_reply_retval_handler \ +_(sw_interface_set_dpdk_hqos_pipe_reply) \ +_(sw_interface_set_dpdk_hqos_subport_reply) \ +_(sw_interface_set_dpdk_hqos_tctbl_reply) + +#define _(n) \ + static void vl_api_##n##_t_handler \ + (vl_api_##n##_t * mp) \ + { \ + vat_main_t * vam = dpdk_test_main.vat_main; \ + i32 retval = ntohl(mp->retval); \ + if (vam->async_mode) { \ + vam->async_errors += (retval < 0); \ + } else { \ + vam->retval = retval; \ + vam->result_ready = 1; \ + } \ + } +foreach_standard_reply_retval_handler; +#undef _ + +/* + * Table of message reply handlers, must include boilerplate handlers + * we just generated + */ +#define foreach_vpe_api_reply_msg \ +_(SW_INTERFACE_SET_DPDK_HQOS_PIPE_REPLY, \ + sw_interface_set_dpdk_hqos_pipe_reply) \ +_(SW_INTERFACE_SET_DPDK_HQOS_SUBPORT_REPLY, \ + sw_interface_set_dpdk_hqos_subport_reply) \ +_(SW_INTERFACE_SET_DPDK_HQOS_TCTBL_REPLY, \ + sw_interface_set_dpdk_hqos_tctbl_reply) + +/* M: construct, but don't yet send a message */ +#define M(T,t) \ +do { \ + vam->result_ready = 0; \ + mp = vl_msg_api_alloc(sizeof(*mp)); \ + memset (mp, 0, sizeof (*mp)); \ + mp->_vl_msg_id = ntohs (VL_API_##T + dm->msg_id_base); \ + mp->client_index = vam->my_client_index; \ +} while(0); + +#define M2(T,t,n) \ +do { \ + vam->result_ready = 0; \ + mp = vl_msg_api_alloc(sizeof(*mp)+(n)); \ + memset (mp, 0, sizeof (*mp)); \ + mp->_vl_msg_id = ntohs (VL_API_##T + dm->msg_id_base); \ + mp->client_index = vam->my_client_index; \ +} while(0); + +/* S: send a message */ +#define S (vl_msg_api_send_shmem (vam->vl_input_queue, (u8 *)&mp)) + +/* W: wait for results, with timeout */ +#define W \ +do { \ + timeout = vat_time_now (vam) + 1.0; \ + \ + while (vat_time_now (vam) < timeout) { \ + if (vam->result_ready == 1) { \ + return (vam->retval); \ + } \ + } \ + return -99; \ +} while(0); + +static int +api_sw_interface_set_dpdk_hqos_pipe (vat_main_t * vam) +{ + dpdk_test_main_t * dm = &dpdk_test_main; + unformat_input_t *i = vam->input; + vl_api_sw_interface_set_dpdk_hqos_pipe_t *mp; + f64 timeout; + u32 sw_if_index; + u8 
sw_if_index_set = 0; + u32 subport; + u8 subport_set = 0; + u32 pipe; + u8 pipe_set = 0; + u32 profile; + u8 profile_set = 0; + + /* Parse args required to build the message */ + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) + { + if (unformat (i, "rx sw_if_index %u", &sw_if_index)) + sw_if_index_set = 1; + else if (unformat (i, "subport %u", &subport)) + subport_set = 1; + else if (unformat (i, "pipe %u", &pipe)) + pipe_set = 1; + else if (unformat (i, "profile %u", &profile)) + profile_set = 1; + else + break; + } + + if (sw_if_index_set == 0) + { + errmsg ("missing interface name or sw_if_index"); + return -99; + } + + if (subport_set == 0) + { + errmsg ("missing subport "); + return -99; + } + + if (pipe_set == 0) + { + errmsg ("missing pipe"); + return -99; + } + + if (profile_set == 0) + { + errmsg ("missing profile"); + return -99; + } + + M (SW_INTERFACE_SET_DPDK_HQOS_PIPE, sw_interface_set_dpdk_hqos_pipe); + + mp->sw_if_index = ntohl (sw_if_index); + mp->subport = ntohl (subport); + mp->pipe = ntohl (pipe); + mp->profile = ntohl (profile); + + + S; + W; + /* NOTREACHED */ + return 0; +} + +static int +api_sw_interface_set_dpdk_hqos_subport (vat_main_t * vam) +{ + dpdk_test_main_t * dm = &dpdk_test_main; + unformat_input_t *i = vam->input; + vl_api_sw_interface_set_dpdk_hqos_subport_t *mp; + f64 timeout; + u32 sw_if_index; + u8 sw_if_index_set = 0; + u32 subport; + u8 subport_set = 0; + u32 tb_rate = 1250000000; /* 10GbE */ + u32 tb_size = 1000000; + u32 tc_rate[] = { 1250000000, 1250000000, 1250000000, 1250000000 }; + u32 tc_period = 10; + + /* Parse args required to build the message */ + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) + { + if (unformat (i, "rx sw_if_index %u", &sw_if_index)) + sw_if_index_set = 1; + else if (unformat (i, "subport %u", &subport)) + subport_set = 1; + else if (unformat (i, "rate %u", &tb_rate)) + { + u32 tc_id; + + for (tc_id = 0; tc_id < (sizeof (tc_rate) / sizeof (tc_rate[0])); + tc_id++) + tc_rate[tc_id] = tb_rate; + } + else if (unformat (i, "bktsize %u", &tb_size)) + ; + else if (unformat (i, "tc0 %u", &tc_rate[0])) + ; + else if (unformat (i, "tc1 %u", &tc_rate[1])) + ; + else if (unformat (i, "tc2 %u", &tc_rate[2])) + ; + else if (unformat (i, "tc3 %u", &tc_rate[3])) + ; + else if (unformat (i, "period %u", &tc_period)) + ; + else + break; + } + + if (sw_if_index_set == 0) + { + errmsg ("missing interface name or sw_if_index"); + return -99; + } + + if (subport_set == 0) + { + errmsg ("missing subport "); + return -99; + } + + M (SW_INTERFACE_SET_DPDK_HQOS_SUBPORT, sw_interface_set_dpdk_hqos_subport); + + mp->sw_if_index = ntohl (sw_if_index); + mp->subport = ntohl (subport); + mp->tb_rate = ntohl (tb_rate); + mp->tb_size = ntohl (tb_size); + mp->tc_rate[0] = ntohl (tc_rate[0]); + mp->tc_rate[1] = ntohl (tc_rate[1]); + mp->tc_rate[2] = ntohl (tc_rate[2]); + mp->tc_rate[3] = ntohl (tc_rate[3]); + mp->tc_period = ntohl (tc_period); + + S; + W; + /* NOTREACHED */ + return 0; +} + +static int +api_sw_interface_set_dpdk_hqos_tctbl (vat_main_t * vam) +{ + dpdk_test_main_t * dm = &dpdk_test_main; + unformat_input_t *i = vam->input; + vl_api_sw_interface_set_dpdk_hqos_tctbl_t *mp; + f64 timeout; + u32 sw_if_index; + u8 sw_if_index_set = 0; + u8 entry_set = 0; + u8 tc_set = 0; + u8 queue_set = 0; + u32 entry, tc, queue; + + /* Parse args required to build the message */ + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) + { + if (unformat (i, "rx sw_if_index %u", &sw_if_index)) + sw_if_index_set = 1; + else if 
(unformat (i, "entry %d", &entry)) + entry_set = 1; + else if (unformat (i, "tc %d", &tc)) + tc_set = 1; + else if (unformat (i, "queue %d", &queue)) + queue_set = 1; + else + break; + } + + if (sw_if_index_set == 0) + { + errmsg ("missing interface name or sw_if_index"); + return -99; + } + + if (entry_set == 0) + { + errmsg ("missing entry "); + return -99; + } + + if (tc_set == 0) + { + errmsg ("missing traffic class "); + return -99; + } + + if (queue_set == 0) + { + errmsg ("missing queue "); + return -99; + } + + M (SW_INTERFACE_SET_DPDK_HQOS_TCTBL, sw_interface_set_dpdk_hqos_tctbl); + + mp->sw_if_index = ntohl (sw_if_index); + mp->entry = ntohl (entry); + mp->tc = ntohl (tc); + mp->queue = ntohl (queue); + + S; + W; + /* NOTREACHED */ + return 0; +} + +/* + * List of messages that the api test plugin sends, + * and that the data plane plugin processes + */ +#define foreach_vpe_api_msg \ +_(sw_interface_set_dpdk_hqos_pipe, \ + "rx sw_if_index <id> subport <subport-id> pipe <pipe-id>\n" \ + "profile <profile-id>\n") \ +_(sw_interface_set_dpdk_hqos_subport, \ + "rx sw_if_index <id> subport <subport-id> [rate <n>]\n" \ + "[bktsize <n>] [tc0 <n>] [tc1 <n>] [tc2 <n>] [tc3 <n>] [period <n>]\n") \ +_(sw_interface_set_dpdk_hqos_tctbl, \ + "rx sw_if_index <id> entry <n> tc <n> queue <n>\n") + +static void dpdk_api_hookup (vat_main_t *vam) +{ + dpdk_test_main_t * dm __attribute__((unused)) = &dpdk_test_main; + /* Hook up handlers for replies from the data plane plug-in */ +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + dm->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_vpe_api_reply_msg; +#undef _ + + /* API messages we can send */ +#define _(n,h) hash_set_mem (vam->function_by_name, #n, api_##n); + foreach_vpe_api_msg; +#undef _ + + /* Help strings */ +#define _(n,h) hash_set_mem (vam->help_by_name, #n, h); + foreach_vpe_api_msg; +#undef _ +} + +clib_error_t * vat_plugin_register (vat_main_t *vam) +{ + dpdk_test_main_t * dm = &dpdk_test_main; + u8 * name; + + dm->vat_main = vam; + + /* Ask the vpp engine for the first assigned message-id */ + name = format (0, "dpdk_%08x%c", api_version, 0); + dm->msg_id_base = vl_client_get_first_plugin_msg_id ((char *) name); + + if (dm->msg_id_base != (u16) ~0) + dpdk_api_hookup (vam); + + vec_free(name); + + return 0; +} diff --git a/src/plugins/dpdk/buffer.c b/src/plugins/dpdk/buffer.c new file mode 100644 index 00000000..02a11b83 --- /dev/null +++ b/src/plugins/dpdk/buffer.c @@ -0,0 +1,575 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * buffer.c: allocate/free network buffers. 
+ * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * @file + * + * Allocate/free network buffers. + */ + +#include <rte_config.h> + +#include <rte_common.h> +#include <rte_log.h> +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_tailq.h> +#include <rte_eal.h> +#include <rte_per_lcore.h> +#include <rte_launch.h> +#include <rte_atomic.h> +#include <rte_cycles.h> +#include <rte_prefetch.h> +#include <rte_lcore.h> +#include <rte_per_lcore.h> +#include <rte_branch_prediction.h> +#include <rte_interrupts.h> +#include <rte_pci.h> +#include <rte_random.h> +#include <rte_debug.h> +#include <rte_ether.h> +#include <rte_ethdev.h> +#include <rte_ring.h> +#include <rte_mempool.h> +#include <rte_mbuf.h> +#include <rte_version.h> + +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <dpdk/device/dpdk.h> +#include <dpdk/device/dpdk_priv.h> + + +STATIC_ASSERT (VLIB_BUFFER_PRE_DATA_SIZE == RTE_PKTMBUF_HEADROOM, + "VLIB_BUFFER_PRE_DATA_SIZE must be equal to RTE_PKTMBUF_HEADROOM"); + +static_always_inline void +dpdk_rte_pktmbuf_free (vlib_main_t * vm, vlib_buffer_t * b) +{ + vlib_buffer_t *hb = b; + struct rte_mbuf *mb; + u32 next, flags; + mb = rte_mbuf_from_vlib_buffer (hb); + +next: + flags = b->flags; + next = b->next_buffer; + mb = rte_mbuf_from_vlib_buffer (b); + + if (PREDICT_FALSE (b->n_add_refs)) + { + rte_mbuf_refcnt_update (mb, b->n_add_refs); + b->n_add_refs = 0; + } + + rte_pktmbuf_free_seg (mb); + + if (flags & VLIB_BUFFER_NEXT_PRESENT) + { + b = vlib_get_buffer (vm, next); + goto next; + } +} + +static void +del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f) +{ + u32 i; + vlib_buffer_t *b; + + for (i = 0; i < vec_len (f->buffers); i++) + { + b = vlib_get_buffer (vm, f->buffers[i]); + dpdk_rte_pktmbuf_free (vm, b); + } + + vec_free (f->name); + vec_free (f->buffers); +} + +/* Add buffer free list. */ +static void +dpdk_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *f; + u32 merge_index; + int i; + + ASSERT (vlib_get_thread_index () == 0); + + f = vlib_buffer_get_free_list (vm, free_list_index); + + merge_index = vlib_buffer_get_free_list_with_size (vm, f->n_data_bytes); + if (merge_index != ~0 && merge_index != free_list_index) + { + vlib_buffer_merge_free_lists (pool_elt_at_index + (bm->buffer_free_list_pool, merge_index), + f); + } + + del_free_list (vm, f); + + /* Poison it. 
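The 0xab fill makes any use-after-free of this free list easy to spot in a debugger.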
*/ + memset (f, 0xab, sizeof (f[0])); + + pool_put (bm->buffer_free_list_pool, f); + + for (i = 1; i < vec_len (vlib_mains); i++) + { + bm = vlib_mains[i]->buffer_main; + f = vlib_buffer_get_free_list (vlib_mains[i], free_list_index);; + memset (f, 0xab, sizeof (f[0])); + pool_put (bm->buffer_free_list_pool, f); + } +} + +/* Make sure free list has at least given number of free buffers. */ +static uword +fill_free_list (vlib_main_t * vm, + vlib_buffer_free_list_t * fl, uword min_free_buffers) +{ + dpdk_main_t *dm = &dpdk_main; + vlib_buffer_t *b0, *b1, *b2, *b3; + int n, i; + u32 bi0, bi1, bi2, bi3; + unsigned socket_id = rte_socket_id (); + struct rte_mempool *rmp = dm->pktmbuf_pools[socket_id]; + struct rte_mbuf *mb0, *mb1, *mb2, *mb3; + + /* Too early? */ + if (PREDICT_FALSE (rmp == 0)) + return 0; + + /* Already have enough free buffers on free list? */ + n = min_free_buffers - vec_len (fl->buffers); + if (n <= 0) + return min_free_buffers; + + /* Always allocate round number of buffers. */ + n = round_pow2 (n, CLIB_CACHE_LINE_BYTES / sizeof (u32)); + + /* Always allocate new buffers in reasonably large sized chunks. */ + n = clib_max (n, fl->min_n_buffers_each_physmem_alloc); + + vec_validate_aligned (vm->mbuf_alloc_list, n - 1, CLIB_CACHE_LINE_BYTES); + + if (rte_mempool_get_bulk (rmp, vm->mbuf_alloc_list, n) < 0) + return 0; + + _vec_len (vm->mbuf_alloc_list) = n; + + i = 0; + + while (i < (n - 7)) + { + vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf + (vm->mbuf_alloc_list[i + 4]), STORE); + vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf + (vm->mbuf_alloc_list[i + 5]), STORE); + vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf + (vm->mbuf_alloc_list[i + 6]), STORE); + vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf + (vm->mbuf_alloc_list[i + 7]), STORE); + + mb0 = vm->mbuf_alloc_list[i]; + mb1 = vm->mbuf_alloc_list[i + 1]; + mb2 = vm->mbuf_alloc_list[i + 2]; + mb3 = vm->mbuf_alloc_list[i + 3]; + + b0 = vlib_buffer_from_rte_mbuf (mb0); + b1 = vlib_buffer_from_rte_mbuf (mb1); + b2 = vlib_buffer_from_rte_mbuf (mb2); + b3 = vlib_buffer_from_rte_mbuf (mb3); + + bi0 = vlib_get_buffer_index (vm, b0); + bi1 = vlib_get_buffer_index (vm, b1); + bi2 = vlib_get_buffer_index (vm, b2); + bi3 = vlib_get_buffer_index (vm, b3); + + vec_add1_aligned (fl->buffers, bi0, CLIB_CACHE_LINE_BYTES); + vec_add1_aligned (fl->buffers, bi1, CLIB_CACHE_LINE_BYTES); + vec_add1_aligned (fl->buffers, bi2, CLIB_CACHE_LINE_BYTES); + vec_add1_aligned (fl->buffers, bi3, CLIB_CACHE_LINE_BYTES); + + vlib_buffer_init_for_free_list (b0, fl); + vlib_buffer_init_for_free_list (b1, fl); + vlib_buffer_init_for_free_list (b2, fl); + vlib_buffer_init_for_free_list (b3, fl); + + if (fl->buffer_init_function) + { + fl->buffer_init_function (vm, fl, &bi0, 1); + fl->buffer_init_function (vm, fl, &bi1, 1); + fl->buffer_init_function (vm, fl, &bi2, 1); + fl->buffer_init_function (vm, fl, &bi3, 1); + } + i += 4; + } + + while (i < n) + { + mb0 = vm->mbuf_alloc_list[i]; + + b0 = vlib_buffer_from_rte_mbuf (mb0); + bi0 = vlib_get_buffer_index (vm, b0); + + vec_add1_aligned (fl->buffers, bi0, CLIB_CACHE_LINE_BYTES); + + vlib_buffer_init_for_free_list (b0, fl); + + if (fl->buffer_init_function) + fl->buffer_init_function (vm, fl, &bi0, 1); + i++; + } + + fl->n_alloc += n; + + return n; +} + +static u32 +alloc_from_free_list (vlib_main_t * vm, + vlib_buffer_free_list_t * free_list, + u32 * alloc_buffers, u32 n_alloc_buffers) +{ + u32 *dst, *src; + uword len, n_filled; + + dst = alloc_buffers; + + n_filled = 
fill_free_list (vm, free_list, n_alloc_buffers); + if (n_filled == 0) + return 0; + + len = vec_len (free_list->buffers); + ASSERT (len >= n_alloc_buffers); + + src = free_list->buffers + len - n_alloc_buffers; + clib_memcpy (dst, src, n_alloc_buffers * sizeof (u32)); + + _vec_len (free_list->buffers) -= n_alloc_buffers; + + return n_alloc_buffers; +} + +/* Allocate a given number of buffers into given array. + Returns number actually allocated which will be either zero or + number requested. */ +u32 +dpdk_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + + return alloc_from_free_list + (vm, + pool_elt_at_index (bm->buffer_free_list_pool, + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX), + buffers, n_buffers); +} + + +u32 +dpdk_buffer_alloc_from_free_list (vlib_main_t * vm, + u32 * buffers, + u32 n_buffers, u32 free_list_index) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *f; + f = pool_elt_at_index (bm->buffer_free_list_pool, free_list_index); + return alloc_from_free_list (vm, f, buffers, n_buffers); +} + +static_always_inline void +vlib_buffer_free_inline (vlib_main_t * vm, + u32 * buffers, u32 n_buffers, u32 follow_buffer_next) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *fl; + u32 fi; + int i; + u32 (*cb) (vlib_main_t * vm, u32 * buffers, u32 n_buffers, + u32 follow_buffer_next); + + cb = bm->buffer_free_callback; + + if (PREDICT_FALSE (cb != 0)) + n_buffers = (*cb) (vm, buffers, n_buffers, follow_buffer_next); + + if (!n_buffers) + return; + + for (i = 0; i < n_buffers; i++) + { + vlib_buffer_t *b; + + b = vlib_get_buffer (vm, buffers[i]); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b); + fl = vlib_buffer_get_buffer_free_list (vm, b, &fi); + + /* The only current use of this callback: multicast recycle */ + if (PREDICT_FALSE (fl->buffers_added_to_freelist_function != 0)) + { + int j; + + vlib_buffer_add_to_free_list + (vm, fl, buffers[i], (b->flags & VLIB_BUFFER_RECYCLE) == 0); + + for (j = 0; j < vec_len (bm->announce_list); j++) + { + if (fl == bm->announce_list[j]) + goto already_announced; + } + vec_add1 (bm->announce_list, fl); + already_announced: + ; + } + else + { + if (PREDICT_TRUE ((b->flags & VLIB_BUFFER_RECYCLE) == 0)) + dpdk_rte_pktmbuf_free (vm, b); + } + } + if (vec_len (bm->announce_list)) + { + vlib_buffer_free_list_t *fl; + for (i = 0; i < vec_len (bm->announce_list); i++) + { + fl = bm->announce_list[i]; + fl->buffers_added_to_freelist_function (vm, fl); + } + _vec_len (bm->announce_list) = 0; + } +} + +static void +dpdk_buffer_free (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +{ + vlib_buffer_free_inline (vm, buffers, n_buffers, /* follow_buffer_next */ + 1); +} + +static void +dpdk_buffer_free_no_next (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +{ + vlib_buffer_free_inline (vm, buffers, n_buffers, /* follow_buffer_next */ + 0); +} + +static void +dpdk_packet_template_init (vlib_main_t * vm, + void *vt, + void *packet_data, + uword n_packet_data_bytes, + uword min_n_buffers_each_physmem_alloc, u8 * name) +{ + vlib_packet_template_t *t = (vlib_packet_template_t *) vt; + + vlib_worker_thread_barrier_sync (vm); + memset (t, 0, sizeof (t[0])); + + vec_add (t->packet_data, packet_data, n_packet_data_bytes); + + vlib_worker_thread_barrier_release (vm); +} + +clib_error_t * +dpdk_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, + unsigned socket_id) +{ + dpdk_main_t *dm = &dpdk_main; + struct rte_mempool *rmp; + int i; + + vec_validate_aligned 
(dm->pktmbuf_pools, socket_id, CLIB_CACHE_LINE_BYTES); + + /* pool already exists, nothing to do */ + if (dm->pktmbuf_pools[socket_id]) + return 0; + + u8 *pool_name = format (0, "mbuf_pool_socket%u%c", socket_id, 0); + + rmp = rte_pktmbuf_pool_create ((char *) pool_name, /* pool name */ + num_mbufs, /* number of mbufs */ + 512, /* cache size */ + VLIB_BUFFER_HDR_SIZE, /* priv size */ + VLIB_BUFFER_PRE_DATA_SIZE + VLIB_BUFFER_DATA_SIZE, /* dataroom size */ + socket_id); /* cpu socket */ + + if (rmp) + { + { + struct rte_mempool_memhdr *memhdr; + + STAILQ_FOREACH (memhdr, &rmp->mem_list, next) + vlib_buffer_add_mem_range (vm, (uword) memhdr->addr, memhdr->len); + } + if (rmp) + { + dm->pktmbuf_pools[socket_id] = rmp; + vec_free (pool_name); + return 0; + } + } + + vec_free (pool_name); + + /* no usable pool for this socket, try to use pool from another one */ + for (i = 0; i < vec_len (dm->pktmbuf_pools); i++) + { + if (dm->pktmbuf_pools[i]) + { + clib_warning + ("WARNING: Failed to allocate mempool for CPU socket %u. " + "Threads running on socket %u will use socket %u mempool.", + socket_id, socket_id, i); + dm->pktmbuf_pools[socket_id] = dm->pktmbuf_pools[i]; + return 0; + } + } + + return clib_error_return (0, "failed to allocate mempool on socket %u", + socket_id); +} + +#if CLIB_DEBUG > 0 + +u32 *vlib_buffer_state_validation_lock; +uword *vlib_buffer_state_validation_hash; +void *vlib_buffer_state_heap; + +static clib_error_t * +buffer_state_validation_init (vlib_main_t * vm) +{ + void *oldheap; + + vlib_buffer_state_heap = mheap_alloc (0, 10 << 20); + + oldheap = clib_mem_set_heap (vlib_buffer_state_heap); + + vlib_buffer_state_validation_hash = hash_create (0, sizeof (uword)); + vec_validate_aligned (vlib_buffer_state_validation_lock, 0, + CLIB_CACHE_LINE_BYTES); + clib_mem_set_heap (oldheap); + return 0; +} + +VLIB_INIT_FUNCTION (buffer_state_validation_init); +#endif + +#if CLI_DEBUG +struct dpdk_validate_buf_result +{ + u32 invalid; + u32 uninitialized; +}; + +#define DPDK_TRAJECTORY_POISON 31 + +static void +dpdk_buffer_validate_trajectory (struct rte_mempool *mp, void *opaque, + void *obj, unsigned obj_idx) +{ + vlib_buffer_t *b; + struct dpdk_validate_buf_result *counter = opaque; + b = vlib_buffer_from_rte_mbuf ((struct rte_mbuf *) obj); + if (b->pre_data[0] != 0) + { + if (b->pre_data[0] == DPDK_TRAJECTORY_POISON) + counter->uninitialized++; + else + counter->invalid++; + } +} + +int +dpdk_buffer_validate_trajectory_all (u32 * uninitialized) +{ + dpdk_main_t *dm = &dpdk_main; + struct dpdk_validate_buf_result counter = { 0 }; + int i; + + for (i = 0; i < vec_len (dm->pktmbuf_pools); i++) + rte_mempool_obj_iter (dm->pktmbuf_pools[i], + dpdk_buffer_validate_trajectory, &counter); + if (uninitialized) + *uninitialized = counter.uninitialized; + return counter.invalid; +} + +static void +dpdk_buffer_poison_trajectory (struct rte_mempool *mp, void *opaque, + void *obj, unsigned obj_idx) +{ + vlib_buffer_t *b; + b = vlib_buffer_from_rte_mbuf ((struct rte_mbuf *) obj); + b->pre_data[0] = DPDK_TRAJECTORY_POISON; +} + +void +dpdk_buffer_poison_trajectory_all (void) +{ + dpdk_main_t *dm = &dpdk_main; + int i; + + for (i = 0; i < vec_len (dm->pktmbuf_pools); i++) + rte_mempool_obj_iter (dm->pktmbuf_pools[i], dpdk_buffer_poison_trajectory, + 0); +} +#endif + +/* *INDENT-OFF* */ +VLIB_BUFFER_REGISTER_CALLBACKS (dpdk, static) = { + .vlib_buffer_alloc_cb = &dpdk_buffer_alloc, + .vlib_buffer_alloc_from_free_list_cb = &dpdk_buffer_alloc_from_free_list, + .vlib_buffer_free_cb = 
&dpdk_buffer_free, + .vlib_buffer_free_no_next_cb = &dpdk_buffer_free_no_next, + .vlib_packet_template_init_cb = &dpdk_packet_template_init, + .vlib_buffer_delete_free_list_cb = &dpdk_buffer_delete_free_list, +}; +/* *INDENT-ON* */ + +/** @endcond */ +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/dpdk/device/cli.c b/src/plugins/dpdk/device/cli.c new file mode 100644 index 00000000..c9fcea5c --- /dev/null +++ b/src/plugins/dpdk/device/cli.c @@ -0,0 +1,1955 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <vnet/vnet.h> +#include <vppinfra/vec.h> +#include <vppinfra/error.h> +#include <vppinfra/format.h> +#include <vppinfra/xxhash.h> + +#include <vnet/ethernet/ethernet.h> +#include <dpdk/device/dpdk.h> +#include <vnet/classify/vnet_classify.h> +#include <vnet/mpls/packet.h> + +#include <dpdk/device/dpdk_priv.h> + +/** + * @file + * @brief CLI for DPDK Abstraction Layer and pcap Tx Trace. + * + * This file contains the source code for CLI for DPDK + * Abstraction Layer and pcap Tx Trace. + */ + + +static clib_error_t * +get_hqos (u32 hw_if_index, u32 subport_id, dpdk_device_t ** xd, + dpdk_device_config_t ** devconf) +{ + dpdk_main_t *dm = &dpdk_main; + vnet_hw_interface_t *hw; + struct rte_eth_dev_info dev_info; + uword *p = 0; + clib_error_t *error = NULL; + + + if (hw_if_index == (u32) ~ 0) + { + error = clib_error_return (0, "please specify valid interface name"); + goto done; + } + + if (subport_id != 0) + { + error = clib_error_return (0, "Invalid subport"); + goto done; + } + + hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index); + *xd = vec_elt_at_index (dm->devices, hw->dev_instance); + + rte_eth_dev_info_get ((*xd)->device_index, &dev_info); + if (dev_info.pci_dev) + { /* bonded interface has no pci info */ + vlib_pci_addr_t pci_addr; + + pci_addr.domain = dev_info.pci_dev->addr.domain; + pci_addr.bus = dev_info.pci_dev->addr.bus; + pci_addr.slot = dev_info.pci_dev->addr.devid; + pci_addr.function = dev_info.pci_dev->addr.function; + + p = + hash_get (dm->conf->device_config_index_by_pci_addr, pci_addr.as_u32); + } + + if (p) + (*devconf) = pool_elt_at_index (dm->conf->dev_confs, p[0]); + else + (*devconf) = &dm->conf->default_devconf; + +done: + return error; +} + +static clib_error_t * +pcap_trace_command_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ +#define PCAP_DEF_PKT_TO_CAPTURE (100) + + unformat_input_t _line_input, *line_input = &_line_input; + dpdk_main_t *dm = &dpdk_main; + u8 *filename; + u8 *chroot_filename = 0; + u32 max = 0; + int enabled = 0; + int errorFlag = 0; + clib_error_t *error = 0; + + /* Get a line of input. 
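If the user supplied no further input, there is nothing to do.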
*/ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "on")) + { + if (dm->tx_pcap_enable == 0) + { + enabled = 1; + } + else + { + vlib_cli_output (vm, "pcap tx capture already on..."); + errorFlag = 1; + break; + } + } + else if (unformat (line_input, "off")) + { + if (dm->tx_pcap_enable) + { + vlib_cli_output (vm, "captured %d pkts...", + dm->pcap_main.n_packets_captured + 1); + if (dm->pcap_main.n_packets_captured) + { + dm->pcap_main.n_packets_to_capture = + dm->pcap_main.n_packets_captured; + error = pcap_write (&dm->pcap_main); + if (error) + clib_error_report (error); + else + vlib_cli_output (vm, "saved to %s...", dm->pcap_filename); + } + + dm->tx_pcap_enable = 0; + } + else + { + vlib_cli_output (vm, "pcap tx capture already off..."); + errorFlag = 1; + break; + } + } + else if (unformat (line_input, "max %d", &max)) + { + if (dm->tx_pcap_enable) + { + vlib_cli_output (vm, + "can't change max value while pcap tx capture active..."); + errorFlag = 1; + break; + } + } + else if (unformat (line_input, "intfc %U", + unformat_vnet_sw_interface, dm->vnet_main, + &dm->pcap_sw_if_index)) + ; + + else if (unformat (line_input, "intfc any")) + { + dm->pcap_sw_if_index = 0; + } + else if (unformat (line_input, "file %s", &filename)) + { + if (dm->tx_pcap_enable) + { + vlib_cli_output (vm, + "can't change file while pcap tx capture active..."); + errorFlag = 1; + break; + } + + /* Brain-police user path input */ + if (strstr ((char *) filename, "..") + || index ((char *) filename, '/')) + { + vlib_cli_output (vm, "illegal characters in filename '%s'", + filename); + vlib_cli_output (vm, + "Hint: Only filename, do not enter directory structure."); + vec_free (filename); + errorFlag = 1; + break; + } + + chroot_filename = format (0, "/tmp/%s%c", filename, 0); + vec_free (filename); + } + else if (unformat (line_input, "status")) + { + if (dm->pcap_sw_if_index == 0) + { + vlib_cli_output (vm, "max is %d for any interface to file %s", + dm-> + pcap_pkts_to_capture ? dm->pcap_pkts_to_capture + : PCAP_DEF_PKT_TO_CAPTURE, + dm-> + pcap_filename ? dm->pcap_filename : (u8 *) + "/tmp/vpe.pcap"); + } + else + { + vlib_cli_output (vm, "max is %d for interface %U to file %s", + dm-> + pcap_pkts_to_capture ? dm->pcap_pkts_to_capture + : PCAP_DEF_PKT_TO_CAPTURE, + format_vnet_sw_if_index_name, dm->vnet_main, + dm->pcap_sw_if_index, + dm-> + pcap_filename ? dm->pcap_filename : (u8 *) + "/tmp/vpe.pcap"); + } + + if (dm->tx_pcap_enable == 0) + { + vlib_cli_output (vm, "pcap tx capture is off..."); + } + else + { + vlib_cli_output (vm, "pcap tx capture is on: %d of %d pkts...", + dm->pcap_main.n_packets_captured, + dm->pcap_main.n_packets_to_capture); + } + break; + } + + else + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + errorFlag = 1; + break; + } + } + unformat_free (line_input); + + + if (errorFlag == 0) + { + /* Since no error, save configured values. 
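Note that the pcap state itself is (re)initialized only when capture is being switched on.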
*/ + if (chroot_filename) + { + if (dm->pcap_filename) + vec_free (dm->pcap_filename); + vec_add1 (chroot_filename, 0); + dm->pcap_filename = chroot_filename; + } + + if (max) + dm->pcap_pkts_to_capture = max; + + + if (enabled) + { + if (dm->pcap_filename == 0) + dm->pcap_filename = format (0, "/tmp/vpe.pcap%c", 0); + + memset (&dm->pcap_main, 0, sizeof (dm->pcap_main)); + dm->pcap_main.file_name = (char *) dm->pcap_filename; + dm->pcap_main.n_packets_to_capture = PCAP_DEF_PKT_TO_CAPTURE; + if (dm->pcap_pkts_to_capture) + dm->pcap_main.n_packets_to_capture = dm->pcap_pkts_to_capture; + + dm->pcap_main.packet_type = PCAP_PACKET_TYPE_ethernet; + dm->tx_pcap_enable = 1; + vlib_cli_output (vm, "pcap tx capture on..."); + } + } + else if (chroot_filename) + vec_free (chroot_filename); + + + return error; +} + +/*? + * This command is used to start or stop a packet capture, or show + * the status of packet capture. + * + * This command has the following optional parameters: + * + * - <b>on|off</b> - Used to start or stop a packet capture. + * + * - <b>max <nn></b> - Depth of local buffer. Once '<em>nn</em>' number + * of packets have been received, buffer is flushed to file. Once another + * '<em>nn</em>' number of packets have been received, buffer is flushed + * to file, overwriting previous write. If not entered, value defaults + * to 100. Can only be updated if packet capture is off. + * + * - <b>intfc <interface>|any</b> - Used to specify a given interface, + * or use '<em>any</em>' to run packet capture on all interfaces. + * '<em>any</em>' is the default if not provided. Settings from a previous + * packet capture are preserved, so '<em>any</em>' can be used to reset + * the interface setting. + * + * - <b>file <name></b> - Used to specify the output filename. The file will + * be placed in the '<em>/tmp</em>' directory, so only the filename is + * supported. Directory should not be entered. If the file already exists, it + * will be overwritten. If no filename is provided, '<em>/tmp/vpe.pcap</em>' + * will be used. Can only be updated if packet capture is off. + * + * - <b>status</b> - Displays the current status and configured attributes + * associated with a packet capture. If packet capture is in progress, + * '<em>status</em>' will also return the number of packets currently in + * the local buffer. Any additional attributes entered on the command line + * with '<em>status</em>' will be ignored and not applied. + * + * @cliexpar + * Example of how to display the status of a tx packet capture when off: + * @cliexstart{pcap tx trace status} + * max is 100, for any interface to file /tmp/vpe.pcap + * pcap tx capture is off... + * @cliexend + * Example of how to start a tx packet capture: + * @cliexstart{pcap tx trace on max 35 intfc GigabitEthernet0/8/0 file vppTest.pcap} + * pcap tx capture on... + * @cliexend + * Example of how to display the status of a tx packet capture in progress: + * @cliexstart{pcap tx trace status} + * max is 35, for interface GigabitEthernet0/8/0 to file /tmp/vppTest.pcap + * pcap tx capture is on: 20 of 35 pkts... + * @cliexend + * Example of how to stop a tx packet capture: + * @cliexstart{pcap tx trace off} + * captured 21 pkts... + * saved to /tmp/vppTest.pcap... 
+ * @cliexend +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (pcap_trace_command, static) = { + .path = "pcap tx trace", + .short_help = + "pcap tx trace [on|off] [max <nn>] [intfc <interface>|any] [file <name>] [status]", + .function = pcap_trace_command_fn, +}; +/* *INDENT-ON* */ + + +static clib_error_t * +show_dpdk_buffer (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + struct rte_mempool *rmp; + int i; + + for (i = 0; i < vec_len (dpdk_main.pktmbuf_pools); i++) + { + rmp = dpdk_main.pktmbuf_pools[i]; + if (rmp) + { + unsigned available = rte_mempool_avail_count (rmp); + unsigned in_use = rte_mempool_in_use_count (rmp); + + vlib_cli_output (vm, + "name=\"%s\" available = %7d allocated = %7d total = %7d\n", + rmp->name, (u32) available, (u32) in_use, + (u32) (available + in_use)); + } + else + { + vlib_cli_output (vm, "rte_mempool is NULL (!)\n"); + } + } + return 0; +} + +/*? + * This command displays statistics of each DPDK mempool. + * + * @cliexpar + * Example of how to display DPDK buffer data: + * @cliexstart{show dpdk buffer} + * name="mbuf_pool_socket0" available = 15104 allocated = 1280 total = 16384 + * @cliexend +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_show_dpdk_buffer,static) = { + .path = "show dpdk buffer", + .short_help = "show dpdk buffer", + .function = show_dpdk_buffer, + .is_mp_safe = 1, +}; +/* *INDENT-ON* */ + +static clib_error_t * +test_dpdk_buffer (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + static u32 *allocated_buffers; + u32 n_alloc = 0; + u32 n_free = 0; + u32 first, actual_alloc; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "allocate %d", &n_alloc)) + ; + else if (unformat (input, "free %d", &n_free)) + ; + else + break; + } + + if (n_free) + { + if (vec_len (allocated_buffers) < n_free) + return clib_error_return (0, "Can't free %d, only %d allocated", + n_free, vec_len (allocated_buffers)); + + first = vec_len (allocated_buffers) - n_free; + vlib_buffer_free (vm, allocated_buffers + first, n_free); + _vec_len (allocated_buffers) = first; + } + if (n_alloc) + { + first = vec_len (allocated_buffers); + vec_validate (allocated_buffers, + vec_len (allocated_buffers) + n_alloc - 1); + + actual_alloc = vlib_buffer_alloc (vm, allocated_buffers + first, + n_alloc); + _vec_len (allocated_buffers) = first + actual_alloc; + + if (actual_alloc < n_alloc) + vlib_cli_output (vm, "WARNING: only allocated %d buffers", + actual_alloc); + } + + vlib_cli_output (vm, "Currently %d buffers allocated", + vec_len (allocated_buffers)); + + if (allocated_buffers && vec_len (allocated_buffers) == 0) + vec_free (allocated_buffers); + + return 0; +} + +/*? + * This command tests the allocation and freeing of DPDK buffers. + * If both '<em>allocate</em>' and '<em>free</em>' are entered on the + * same command, the '<em>free</em>' is executed first. If no + * parameters are provided, this command displays how many DPDK buffers + * the test command has allocated. 
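+ * Allocated buffers are held in a static vector and persist until explicitly freed.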
+ * + * @cliexpar + * @parblock + * + * Example of how to display how many DPDK buffers the test command has allocated: + * @cliexstart{test dpdk buffer} + * Currently 0 buffers allocated + * @cliexend + * + * Example of how to allocate DPDK buffers using the test command: + * @cliexstart{test dpdk buffer allocate 10} + * Currently 10 buffers allocated + * @cliexend + * + * Example of how to free DPDK buffers allocated by the test command: + * @cliexstart{test dpdk buffer free 10} + * Currently 0 buffers allocated + * @cliexend + * @endparblock +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_test_dpdk_buffer,static) = { + .path = "test dpdk buffer", + .short_help = "test dpdk buffer [allocate <nn>] [free <nn>]", + .function = test_dpdk_buffer, + .is_mp_safe = 1, +}; +/* *INDENT-ON* */ + +static clib_error_t * +set_dpdk_if_desc (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + dpdk_main_t *dm = &dpdk_main; + vnet_hw_interface_t *hw; + dpdk_device_t *xd; + u32 hw_if_index = (u32) ~ 0; + u32 nb_rx_desc = (u32) ~ 0; + u32 nb_tx_desc = (u32) ~ 0; + clib_error_t *error = NULL; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat + (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main, + &hw_if_index)) + ; + else if (unformat (line_input, "tx %d", &nb_tx_desc)) + ; + else if (unformat (line_input, "rx %d", &nb_rx_desc)) + ; + else + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (hw_if_index == (u32) ~ 0) + { + error = clib_error_return (0, "please specify valid interface name"); + goto done; + } + + hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index); + xd = vec_elt_at_index (dm->devices, hw->dev_instance); + + if ((xd->flags & DPDK_DEVICE_FLAG_PMD) == 0) + { + error = + clib_error_return (0, + "number of descriptors can be set only for " + "physical devices"); + goto done; + } + + if ((nb_rx_desc == (u32) ~ 0 || nb_rx_desc == xd->nb_rx_desc) && + (nb_tx_desc == (u32) ~ 0 || nb_tx_desc == xd->nb_tx_desc)) + { + error = clib_error_return (0, "nothing changed"); + goto done; + } + + if (nb_rx_desc != (u32) ~ 0) + xd->nb_rx_desc = nb_rx_desc; + + if (nb_tx_desc != (u32) ~ 0) + xd->nb_tx_desc = nb_tx_desc; + + dpdk_device_setup (xd); + + if (vec_len (xd->errors)) + { + error = clib_error_return (0, "%U", format_dpdk_device_errors, xd); + goto done; + } + +done: + unformat_free (line_input); + + return error; +} + +/*? + * This command sets the number of DPDK '<em>rx</em>' and + * '<em>tx</em>' descriptors for the given physical interface. Use + * the command '<em>show hardware-interface</em>' to display the + * current descriptor allocation. 
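+ * Applying a change calls dpdk_device_setup(), which reconfigures the device with the new values.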
+ * + * @cliexpar + * Example of how to set the DPDK interface descriptors: + * @cliexcmd{set dpdk interface descriptors GigabitEthernet0/8/0 rx 512 tx 512} +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_set_dpdk_if_desc,static) = { + .path = "set dpdk interface descriptors", + .short_help = "set dpdk interface descriptors <interface> [rx <nn>] [tx <nn>]", + .function = set_dpdk_if_desc, +}; +/* *INDENT-ON* */ + +static int +dpdk_device_queue_sort (void *a1, void *a2) +{ + dpdk_device_and_queue_t *dq1 = a1; + dpdk_device_and_queue_t *dq2 = a2; + + if (dq1->device > dq2->device) + return 1; + else if (dq1->device < dq2->device) + return -1; + else if (dq1->queue_id > dq2->queue_id) + return 1; + else if (dq1->queue_id < dq2->queue_id) + return -1; + else + return 0; +} + + +static clib_error_t * +show_dpdk_if_hqos_placement (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vlib_thread_main_t *tm = vlib_get_thread_main (); + dpdk_main_t *dm = &dpdk_main; + dpdk_device_and_queue_t *dq; + int cpu; + + if (tm->n_vlib_mains == 1) + vlib_cli_output (vm, "All interfaces are handled by main thread"); + + for (cpu = 0; cpu < vec_len (dm->devices_by_hqos_cpu); cpu++) + { + if (cpu >= dm->hqos_cpu_first_index && + cpu < (dm->hqos_cpu_first_index + dm->hqos_cpu_count)) + vlib_cli_output (vm, "Thread %u (%s at lcore %u):", cpu, + vlib_worker_threads[cpu].name, + vlib_worker_threads[cpu].lcore_id); + + vec_foreach (dq, dm->devices_by_hqos_cpu[cpu]) + { + u32 hw_if_index = dm->devices[dq->device].hw_if_index; + vnet_hw_interface_t *hi = + vnet_get_hw_interface (dm->vnet_main, hw_if_index); + vlib_cli_output (vm, " %v queue %u", hi->name, dq->queue_id); + } + } + return 0; +} + +/*? + * This command is used to display the thread and core each + * DPDK output interface and HQoS queue is assigned to. 
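+ * When no worker threads are configured, all interfaces are handled by the main thread.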
+ * + * @cliexpar + * Example of how to display the DPDK output interface and HQoS queue placement: + * @cliexstart{show dpdk interface hqos placement} + * Thread 1 (vpp_hqos-threads_0 at lcore 3): + * GigabitEthernet0/8/0 queue 0 + * Thread 2 (vpp_hqos-threads_1 at lcore 4): + * GigabitEthernet0/9/0 queue 0 + * @cliexend +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_show_dpdk_if_hqos_placement, static) = { + .path = "show dpdk interface hqos placement", + .short_help = "show dpdk interface hqos placement", + .function = show_dpdk_if_hqos_placement, +}; +/* *INDENT-ON* */ + +static clib_error_t * +set_dpdk_if_hqos_placement (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + dpdk_main_t *dm = &dpdk_main; + dpdk_device_and_queue_t *dq; + vnet_hw_interface_t *hw; + dpdk_device_t *xd; + u32 hw_if_index = (u32) ~ 0; + u32 cpu = (u32) ~ 0; + int i; + clib_error_t *error = NULL; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat + (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main, + &hw_if_index)) + ; + else if (unformat (line_input, "thread %d", &cpu)) + ; + else + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (hw_if_index == (u32) ~ 0) + { + error = clib_error_return (0, "please specify valid interface name"); + goto done; + } + + if (cpu < dm->hqos_cpu_first_index || + cpu >= (dm->hqos_cpu_first_index + dm->hqos_cpu_count)) + { + error = clib_error_return (0, "please specify valid thread id"); + goto done; + } + + hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index); + xd = vec_elt_at_index (dm->devices, hw->dev_instance); + + for (i = 0; i < vec_len (dm->devices_by_hqos_cpu); i++) + { + vec_foreach (dq, dm->devices_by_hqos_cpu[i]) + { + if (hw_if_index == dm->devices[dq->device].hw_if_index) + { + if (cpu == i) /* nothing to do */ + goto done; + + vec_del1 (dm->devices_by_hqos_cpu[i], + dq - dm->devices_by_hqos_cpu[i]); + vec_add2 (dm->devices_by_hqos_cpu[cpu], dq, 1); + dq->queue_id = 0; + dq->device = xd->device_index; + + vec_sort_with_function (dm->devices_by_hqos_cpu[i], + dpdk_device_queue_sort); + + vec_sort_with_function (dm->devices_by_hqos_cpu[cpu], + dpdk_device_queue_sort); + + goto done; + } + } + } + + error = clib_error_return (0, "not found"); + +done: + unformat_free (line_input); + + return error; +} + +/*? + * This command is used to assign a given DPDK output interface and + * HQoS queue to a different thread. This will not create a thread, + * so the thread must already exist. Use '<em>/etc/vpp/startup.conf</em>' + * for the initial thread creation. See @ref qos_doc for more details. 
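+ * The command reports '<em>not found</em>' if the interface has no HQoS queue placed on any thread. + * HQoS threads are requested at startup, typically with a '<em>cpu</em>' section entry such as + * '<em>corelist-hqos-threads 3-4</em>' (exact syntax depends on the VPP version).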
+ * + * @cliexpar + * Example of how to display the DPDK output interface and HQoS queue placement: + * @cliexstart{show dpdk interface hqos placement} + * Thread 1 (vpp_hqos-threads_0 at lcore 3): + * GigabitEthernet0/8/0 queue 0 + * Thread 2 (vpp_hqos-threads_1 at lcore 4): + * GigabitEthernet0/9/0 queue 0 + * @cliexend + * Example of how to assign a DPDK output interface and HQoS queue to a thread: + * @cliexcmd{set dpdk interface hqos placement GigabitEthernet0/8/0 thread 2} +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_placement, static) = { + .path = "set dpdk interface hqos placement", + .short_help = "set dpdk interface hqos placement <interface> thread <n>", + .function = set_dpdk_if_hqos_placement, +}; +/* *INDENT-ON* */ + +static clib_error_t * +set_dpdk_if_hqos_pipe (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + dpdk_main_t *dm = &dpdk_main; + vnet_hw_interface_t *hw; + dpdk_device_t *xd; + u32 hw_if_index = (u32) ~ 0; + u32 subport_id = (u32) ~ 0; + u32 pipe_id = (u32) ~ 0; + u32 profile_id = (u32) ~ 0; + int rv; + clib_error_t *error = NULL; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat + (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main, + &hw_if_index)) + ; + else if (unformat (line_input, "subport %d", &subport_id)) + ; + else if (unformat (line_input, "pipe %d", &pipe_id)) + ; + else if (unformat (line_input, "profile %d", &profile_id)) + ; + else + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (hw_if_index == (u32) ~ 0) + { + error = clib_error_return (0, "please specify valid interface name"); + goto done; + } + + hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index); + xd = vec_elt_at_index (dm->devices, hw->dev_instance); + + rv = + rte_sched_pipe_config (xd->hqos_ht->hqos, subport_id, pipe_id, + profile_id); + if (rv) + { + error = clib_error_return (0, "pipe configuration failed"); + goto done; + } + +done: + unformat_free (line_input); + + return error; +} + +/*? + * This command is used to change the profile associated with a HQoS pipe. The + * '<em><profile_id></em>' is zero based. Use the command + * '<em>show dpdk interface hqos</em>' to display the content of each profile. + * See @ref qos_doc for more details. + * + * @note + * Currently there is no API to create a new HQoS pipe profile. One is + * created by default in the code (search for '<em>hqos_pipe_params_default</em>'). + * Additional profiles can be created in code and the code recompiled. Then use this + * command to assign it. 
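+ * The profile change is applied immediately via rte_sched_pipe_config().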
+ * + * @cliexpar + * Example of how to assign a new profile to a HQoS pipe: + * @cliexcmd{set dpdk interface hqos pipe GigabitEthernet0/8/0 subport 0 pipe 2 profile 1} +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_pipe, static) = +{ + .path = "set dpdk interface hqos pipe", + .short_help = "set dpdk interface hqos pipe <interface> subport <subport_id> pipe <pipe_id> " + "profile <profile_id>", + .function = set_dpdk_if_hqos_pipe, +}; +/* *INDENT-ON* */ + +static clib_error_t * +set_dpdk_if_hqos_subport (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + dpdk_main_t *dm = &dpdk_main; + dpdk_device_t *xd = NULL; + u32 hw_if_index = (u32) ~ 0; + u32 subport_id = (u32) ~ 0; + struct rte_sched_subport_params p; + int rv; + clib_error_t *error = NULL; + u32 tb_rate = (u32) ~ 0; + u32 tb_size = (u32) ~ 0; + u32 tc_rate[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE] = + { (u32) ~ 0, (u32) ~ 0, (u32) ~ 0, (u32) ~ 0 }; + u32 tc_period = (u32) ~ 0; + dpdk_device_config_t *devconf = NULL; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat + (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main, + &hw_if_index)) + ; + else if (unformat (line_input, "subport %d", &subport_id)) + ; + else if (unformat (line_input, "rate %d", &tb_rate)) + ; + else if (unformat (line_input, "bktsize %d", &tb_size)) + ; + else if (unformat (line_input, "tc0 %d", &tc_rate[0])) + ; + else if (unformat (line_input, "tc1 %d", &tc_rate[1])) + ; + else if (unformat (line_input, "tc2 %d", &tc_rate[2])) + ; + else if (unformat (line_input, "tc3 %d", &tc_rate[3])) + ; + else if (unformat (line_input, "period %d", &tc_period)) + ; + else + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } + } + + error = get_hqos (hw_if_index, subport_id, &xd, &devconf); + + if (error == NULL) + { + /* Copy the current values over to local structure. */ + memcpy (&p, &devconf->hqos.subport[subport_id], sizeof (p)); + + /* Update local structure with input values. */ + if (tb_rate != (u32) ~ 0) + { + p.tb_rate = tb_rate; + p.tc_rate[0] = tb_rate; + p.tc_rate[1] = tb_rate; + p.tc_rate[2] = tb_rate; + p.tc_rate[3] = tb_rate; + } + if (tb_size != (u32) ~ 0) + { + p.tb_size = tb_size; + } + if (tc_rate[0] != (u32) ~ 0) + { + p.tc_rate[0] = tc_rate[0]; + } + if (tc_rate[1] != (u32) ~ 0) + { + p.tc_rate[1] = tc_rate[1]; + } + if (tc_rate[2] != (u32) ~ 0) + { + p.tc_rate[2] = tc_rate[2]; + } + if (tc_rate[3] != (u32) ~ 0) + { + p.tc_rate[3] = tc_rate[3]; + } + if (tc_period != (u32) ~ 0) + { + p.tc_period = tc_period; + } + + /* Apply changes. */ + rv = rte_sched_subport_config (xd->hqos_ht->hqos, subport_id, &p); + if (rv) + { + error = clib_error_return (0, "subport configuration failed"); + goto done; + } + else + { + /* Successfully applied, so save the input values. */ + memcpy (&devconf->hqos.subport[subport_id], &p, sizeof (p)); + } + } + +done: + unformat_free (line_input); + + return error; +} + +/*? + * This command is used to set the subport level parameters such as token + * bucket rate (bytes per second), token bucket size (bytes), traffic class + * rates (bytes per second) and token update period (milliseconds). 
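+ * Any parameter not supplied on the command line keeps its currently configured value.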
+ *
+ * By default, the '<em>rate</em>' is set to 1250000000 bytes/second (10GbE
+ * rate) and each of the four traffic classes is set to 100% of the port rate.
+ * If the '<em>rate</em>' is updated by this command, all four traffic classes
+ * are assigned the same value. Each of the four traffic classes can be updated
+ * individually.
+ *
+ * @cliexpar
+ * Example of how to modify the subport attributes for a 1GbE link:
+ * @cliexcmd{set dpdk interface hqos subport GigabitEthernet0/8/0 subport 0 rate 125000000}
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_subport, static) = {
+  .path = "set dpdk interface hqos subport",
+  .short_help = "set dpdk interface hqos subport <interface> subport <subport_id> "
+                "[rate <n>] [bktsize <n>] [tc0 <n>] [tc1 <n>] [tc2 <n>] [tc3 <n>] "
+                "[period <n>]",
+  .function = set_dpdk_if_hqos_subport,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
+set_dpdk_if_hqos_tctbl (vlib_main_t * vm, unformat_input_t * input,
+                        vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  dpdk_main_t *dm = &dpdk_main;
+  vnet_hw_interface_t *hw;
+  dpdk_device_t *xd;
+  u32 hw_if_index = (u32) ~ 0;
+  u32 tc = (u32) ~ 0;
+  u32 queue = (u32) ~ 0;
+  u32 entry = (u32) ~ 0;
+  u32 val, i;
+  clib_error_t *error = NULL;
+
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat
+          (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+           &hw_if_index))
+        ;
+      else if (unformat (line_input, "entry %d", &entry))
+        ;
+      else if (unformat (line_input, "tc %d", &tc))
+        ;
+      else if (unformat (line_input, "queue %d", &queue))
+        ;
+      else
+        {
+          error = clib_error_return (0, "parse error: '%U'",
+                                     format_unformat_error, line_input);
+          goto done;
+        }
+    }
+
+  if (hw_if_index == (u32) ~ 0)
+    {
+      error = clib_error_return (0, "please specify valid interface name");
+      goto done;
+    }
+  if (entry >= 64)
+    {
+      error = clib_error_return (0, "invalid entry");
+      goto done;
+    }
+  if (tc >= RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE)
+    {
+      error = clib_error_return (0, "invalid traffic class");
+      goto done;
+    }
+  if (queue >= RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS)
+    {
+      error = clib_error_return (0, "invalid traffic class queue");
+      goto done;
+    }
+
+  hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+  xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+  /* Detect the set of worker threads */
+  uword *p = hash_get_mem (tm->thread_registrations_by_name, "workers");
+  /* Should never happen, shut up Coverity warning */
+  if (p == 0)
+    {
+      error = clib_error_return (0, "no worker registrations?");
+      goto done;
+    }
+
+  vlib_thread_registration_t *tr = (vlib_thread_registration_t *) p[0];
+  int worker_thread_first = tr->first_index;
+  int worker_thread_count = tr->count;
+
+  val = tc * RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS + queue;
+  for (i = 0; i < worker_thread_count; i++)
+    xd->hqos_wt[worker_thread_first + i].hqos_tc_table[entry] = val;
+
+done:
+  unformat_free (line_input);
+
+  return error;
+}
+
+/*?
+ * This command is used to set the traffic class translation table. The
+ * traffic class translation table is used to map 64 values (0-63) to one of
+ * four traffic classes and one of four HQoS input queues. Use the '<em>show
+ * dpdk interface hqos</em>' command to display the traffic class translation
+ * table. See @ref qos_doc for more details.
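+ *
+ * Internally each table entry stores
+ * '<em>tc * RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS + queue</em>', so for example
+ * '<em>entry 16 tc 2 queue 2</em>' stores 2 * 4 + 2 = 10 at index 16; the
+ * '<em>show</em>' command decodes entries back with a divide and a modulo.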
+ * + * This command has the following parameters: + * + * - <b><interface></b> - Used to specify the output interface. + * + * - <b>entry <map_val></b> - Mapped value (0-63) to assign traffic class and queue to. + * + * - <b>tc <tc_id></b> - Traffic class (0-3) to be used by the provided mapped value. + * + * - <b>queue <queue_id></b> - HQoS input queue (0-3) to be used by the provided mapped value. + * + * @cliexpar + * Example of how modify the traffic class translation table: + * @cliexcmd{set dpdk interface hqos tctbl GigabitEthernet0/8/0 entry 16 tc 2 queue 2} +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_tctbl, static) = { + .path = "set dpdk interface hqos tctbl", + .short_help = "set dpdk interface hqos tctbl <interface> entry <map_val> tc <tc_id> queue <queue_id>", + .function = set_dpdk_if_hqos_tctbl, +}; +/* *INDENT-ON* */ + +static clib_error_t * +set_dpdk_if_hqos_pktfield (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + vlib_thread_main_t *tm = vlib_get_thread_main (); + dpdk_main_t *dm = &dpdk_main; + clib_error_t *error = NULL; + + /* Device specific data */ + struct rte_eth_dev_info dev_info; + dpdk_device_config_t *devconf = 0; + vnet_hw_interface_t *hw; + dpdk_device_t *xd; + u32 hw_if_index = (u32) ~ 0; + + /* Detect the set of worker threads */ + uword *p = hash_get_mem (tm->thread_registrations_by_name, "workers"); + /* Should never happen, shut up Coverity warning */ + if (p == 0) + return clib_error_return (0, "no worker registrations?"); + + vlib_thread_registration_t *tr = (vlib_thread_registration_t *) p[0]; + int worker_thread_first = tr->first_index; + int worker_thread_count = tr->count; + + /* Packet field configuration */ + u64 mask = (u64) ~ 0; + u32 id = (u32) ~ 0; + u32 offset = (u32) ~ 0; + + /* HQoS params */ + u32 n_subports_per_port, n_pipes_per_subport, tctbl_size; + + u32 i; + + /* Parse input arguments */ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat + (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main, + &hw_if_index)) + ; + else if (unformat (line_input, "id subport")) + id = 0; + else if (unformat (line_input, "id pipe")) + id = 1; + else if (unformat (line_input, "id tc")) + id = 2; + else if (unformat (line_input, "id %d", &id)) + ; + else if (unformat (line_input, "offset %d", &offset)) + ; + else if (unformat (line_input, "mask %llx", &mask)) + ; + else + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } + } + + /* Get interface */ + if (hw_if_index == (u32) ~ 0) + { + error = clib_error_return (0, "please specify valid interface name"); + goto done; + } + + hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index); + xd = vec_elt_at_index (dm->devices, hw->dev_instance); + + rte_eth_dev_info_get (xd->device_index, &dev_info); + if (dev_info.pci_dev) + { /* bonded interface has no pci info */ + vlib_pci_addr_t pci_addr; + + pci_addr.domain = dev_info.pci_dev->addr.domain; + pci_addr.bus = dev_info.pci_dev->addr.bus; + pci_addr.slot = dev_info.pci_dev->addr.devid; + pci_addr.function = dev_info.pci_dev->addr.function; + + p = + hash_get (dm->conf->device_config_index_by_pci_addr, pci_addr.as_u32); + } + + if (p) + devconf = pool_elt_at_index (dm->conf->dev_confs, p[0]); + else + devconf = &dm->conf->default_devconf; + + if (devconf->hqos_enabled == 0) + 
    {
+      vlib_cli_output (vm, "HQoS disabled for this interface");
+      goto done;
+    }
+
+  n_subports_per_port = devconf->hqos.port.n_subports_per_port;
+  n_pipes_per_subport = devconf->hqos.port.n_pipes_per_subport;
+  tctbl_size = RTE_DIM (devconf->hqos.tc_table);
+
+  /* Validate packet field configuration: id, offset and mask */
+  if (id >= 3)
+    {
+      error = clib_error_return (0, "invalid packet field id");
+      goto done;
+    }
+
+  switch (id)
+    {
+    case 0:
+      if (dpdk_hqos_validate_mask (mask, n_subports_per_port) != 0)
+        {
+          error = clib_error_return (0, "invalid subport ID mask "
+                                     "(n_subports_per_port = %u)",
+                                     n_subports_per_port);
+          goto done;
+        }
+      break;
+    case 1:
+      if (dpdk_hqos_validate_mask (mask, n_pipes_per_subport) != 0)
+        {
+          error = clib_error_return (0, "invalid pipe ID mask "
+                                     "(n_pipes_per_subport = %u)",
+                                     n_pipes_per_subport);
+          goto done;
+        }
+      break;
+    case 2:
+    default:
+      if (dpdk_hqos_validate_mask (mask, tctbl_size) != 0)
+        {
+          error = clib_error_return (0, "invalid TC table index mask "
+                                     "(TC table size = %u)", tctbl_size);
+          goto done;
+        }
+    }
+
+  /* Propagate packet field configuration to all workers */
+  for (i = 0; i < worker_thread_count; i++)
+    switch (id)
+      {
+      case 0:
+        xd->hqos_wt[worker_thread_first + i].hqos_field0_slabpos = offset;
+        xd->hqos_wt[worker_thread_first + i].hqos_field0_slabmask = mask;
+        xd->hqos_wt[worker_thread_first + i].hqos_field0_slabshr =
+          __builtin_ctzll (mask);
+        break;
+      case 1:
+        xd->hqos_wt[worker_thread_first + i].hqos_field1_slabpos = offset;
+        xd->hqos_wt[worker_thread_first + i].hqos_field1_slabmask = mask;
+        xd->hqos_wt[worker_thread_first + i].hqos_field1_slabshr =
+          __builtin_ctzll (mask);
+        break;
+      case 2:
+      default:
+        xd->hqos_wt[worker_thread_first + i].hqos_field2_slabpos = offset;
+        xd->hqos_wt[worker_thread_first + i].hqos_field2_slabmask = mask;
+        xd->hqos_wt[worker_thread_first + i].hqos_field2_slabshr =
+          __builtin_ctzll (mask);
+      }
+
+done:
+  unformat_free (line_input);
+
+  return error;
+}
+
+/*?
+ * This command is used to set the packet fields required for classifying the
+ * incoming packet. As a result of the classification process, the packet field
+ * information will be mapped to a 5-tuple (subport, pipe, traffic class, queue,
+ * color) and stored in the packet mbuf.
+ *
+ * This command has the following parameters:
+ *
+ * - <b><interface></b> - Used to specify the output interface.
+ *
+ * - <b>id subport|pipe|tc</b> - Classification occurs across three fields.
+ * This parameter indicates which of the three masks are being configured. Legacy
+ * code used 0-2 to represent these three fields, so 0-2 is still accepted.
+ *   - <b>subport|0</b> - Currently only one subport is supported, so only
+ * an empty mask is supported for the subport classification.
+ *   - <b>pipe|1</b> - Currently, 4096 pipes per subport are supported, so a
+ * 12-bit mask should be configured to map to the 0-4095 pipes.
+ *   - <b>tc|2</b> - The translation table (see the '<em>set dpdk interface hqos
+ * tctbl</em>' command) maps each value (0-63) into one of the 4 traffic classes
+ * per pipe. A 6-bit mask should be configured to map this field to a traffic class.
+ *
+ * - <b>offset <n></b> - Offset in the packet to apply the 64-bit mask for classification.
+ * The offset should be on an 8-byte boundary (0,8,16,24..).
+ *
+ * - <b>mask <hex-mask></b> - 64-bit mask to apply to the packet at the given '<em>offset</em>'.
+ * Bits must be contiguous and should not include '<em>0x</em>'.
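+ *
+ * Conceptually (a paraphrase of the worker-thread code above, not an extra
+ * API), each classification value is extracted as
+ * '<em>(slab & mask) >> ctz(mask)</em>', where '<em>slab</em>' is the 64-bit
+ * word read at '<em>offset</em>' and '<em>ctz(mask)</em>' (stored via
+ * '<em>__builtin_ctzll</em>') counts the trailing zero bits of the mask. For
+ * example, offset 40 with mask 0x0000000fff000000 selects 12 bits and yields
+ * pipe IDs 0-4095.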
+ * + * The default values for the '<em>pktfield</em>' assumes Ethernet/IPv4/UDP packets with + * no VLAN. Adjust based on expected packet format and desired classification field. + * - '<em>subport</em>' is always empty (offset 0 mask 0000000000000000) + * - By default, '<em>pipe</em>' maps to the UDP payload bits 12 .. 23 (offset 40 + * mask 0000000fff000000) + * - By default, '<em>tc</em>' maps to the DSCP field in IP header (offset 48 mask + * 00000000000000fc) + * + * @cliexpar + * Example of how modify the '<em>pipe</em>' classification filter to match VLAN: + * @cliexcmd{set dpdk interface hqos pktfield GigabitEthernet0/8/0 id pipe offset 8 mask 0000000000000FFF} +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_set_dpdk_if_hqos_pktfield, static) = { + .path = "set dpdk interface hqos pktfield", + .short_help = "set dpdk interface hqos pktfield <interface> id subport|pipe|tc offset <n> " + "mask <hex-mask>", + .function = set_dpdk_if_hqos_pktfield, +}; +/* *INDENT-ON* */ + +static clib_error_t * +show_dpdk_if_hqos (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + vlib_thread_main_t *tm = vlib_get_thread_main (); + dpdk_main_t *dm = &dpdk_main; + vnet_hw_interface_t *hw; + dpdk_device_t *xd; + dpdk_device_config_hqos_t *cfg; + dpdk_device_hqos_per_hqos_thread_t *ht; + dpdk_device_hqos_per_worker_thread_t *wk; + u32 *tctbl; + u32 hw_if_index = (u32) ~ 0; + u32 profile_id, subport_id, i; + struct rte_eth_dev_info dev_info; + dpdk_device_config_t *devconf = 0; + vlib_thread_registration_t *tr; + uword *p = 0; + clib_error_t *error = NULL; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat + (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main, + &hw_if_index)) + ; + else + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (hw_if_index == (u32) ~ 0) + { + error = clib_error_return (0, "please specify interface name!!"); + goto done; + } + + hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index); + xd = vec_elt_at_index (dm->devices, hw->dev_instance); + + rte_eth_dev_info_get (xd->device_index, &dev_info); + if (dev_info.pci_dev) + { /* bonded interface has no pci info */ + vlib_pci_addr_t pci_addr; + + pci_addr.domain = dev_info.pci_dev->addr.domain; + pci_addr.bus = dev_info.pci_dev->addr.bus; + pci_addr.slot = dev_info.pci_dev->addr.devid; + pci_addr.function = dev_info.pci_dev->addr.function; + + p = + hash_get (dm->conf->device_config_index_by_pci_addr, pci_addr.as_u32); + } + + if (p) + devconf = pool_elt_at_index (dm->conf->dev_confs, p[0]); + else + devconf = &dm->conf->default_devconf; + + if (devconf->hqos_enabled == 0) + { + vlib_cli_output (vm, "HQoS disabled for this interface"); + goto done; + } + + /* Detect the set of worker threads */ + p = hash_get_mem (tm->thread_registrations_by_name, "workers"); + + /* Should never happen, shut up Coverity warning */ + if (p == 0) + { + error = clib_error_return (0, "no worker registrations?"); + goto done; + } + + tr = (vlib_thread_registration_t *) p[0]; + + cfg = &devconf->hqos; + ht = xd->hqos_ht; + wk = &xd->hqos_wt[tr->first_index]; + tctbl = wk->hqos_tc_table; + + vlib_cli_output (vm, " Thread:"); + vlib_cli_output (vm, " Input SWQ size = %u packets", cfg->swq_size); + vlib_cli_output (vm, " Enqueue burst size = %u packets", + ht->hqos_burst_enq); + 
vlib_cli_output (vm, " Dequeue burst size = %u packets", + ht->hqos_burst_deq); + + vlib_cli_output (vm, + " Packet field 0: slab position = %4u, slab bitmask = 0x%016llx (subport)", + wk->hqos_field0_slabpos, wk->hqos_field0_slabmask); + vlib_cli_output (vm, + " Packet field 1: slab position = %4u, slab bitmask = 0x%016llx (pipe)", + wk->hqos_field1_slabpos, wk->hqos_field1_slabmask); + vlib_cli_output (vm, + " Packet field 2: slab position = %4u, slab bitmask = 0x%016llx (tc)", + wk->hqos_field2_slabpos, wk->hqos_field2_slabmask); + vlib_cli_output (vm, + " Packet field 2 tc translation table: ([Mapped Value Range]: tc/queue tc/queue ...)"); + vlib_cli_output (vm, + " [ 0 .. 15]: " + "%u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u", + tctbl[0] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[0] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[1] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[1] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[2] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[2] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[3] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[3] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[4] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[4] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[5] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[5] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[6] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[6] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[7] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[7] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[8] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[8] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[9] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[9] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[10] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[10] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[11] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[11] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[12] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[12] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[13] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[13] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[14] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[14] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[15] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[15] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS); + vlib_cli_output (vm, + " [16 .. 
31]: " + "%u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u", + tctbl[16] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[16] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[17] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[17] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[18] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[18] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[19] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[19] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[20] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[20] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[21] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[21] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[22] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[22] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[23] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[23] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[24] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[24] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[25] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[25] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[26] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[26] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[27] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[27] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[28] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[28] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[29] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[29] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[30] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[30] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[31] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[31] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS); + vlib_cli_output (vm, + " [32 .. 47]: " + "%u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u", + tctbl[32] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[32] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[33] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[33] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[34] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[34] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[35] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[35] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[36] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[36] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[37] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[37] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[38] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[38] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[39] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[39] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[40] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[40] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[41] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[41] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[42] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[42] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[43] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[43] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[44] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[44] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[45] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[45] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[46] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[46] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[47] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[47] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS); + vlib_cli_output (vm, + " [48 .. 
63]: " + "%u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u %u/%u", + tctbl[48] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[48] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[49] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[49] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[50] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[50] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[51] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[51] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[52] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[52] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[53] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[53] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[54] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[54] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[55] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[55] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[56] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[56] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[57] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[57] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[58] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[58] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[59] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[59] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[60] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[60] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[61] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[61] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[62] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[62] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[63] / RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS, + tctbl[63] % RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS); + vlib_cli_output (vm, " Port:"); + vlib_cli_output (vm, " Rate = %u bytes/second", cfg->port.rate); + vlib_cli_output (vm, " MTU = %u bytes", cfg->port.mtu); + vlib_cli_output (vm, " Frame overhead = %u bytes", + cfg->port.frame_overhead); + vlib_cli_output (vm, " Number of subports = %u", + cfg->port.n_subports_per_port); + vlib_cli_output (vm, " Number of pipes per subport = %u", + cfg->port.n_pipes_per_subport); + vlib_cli_output (vm, + " Packet queue size: TC0 = %u, TC1 = %u, TC2 = %u, TC3 = %u packets", + cfg->port.qsize[0], cfg->port.qsize[1], cfg->port.qsize[2], + cfg->port.qsize[3]); + vlib_cli_output (vm, " Number of pipe profiles = %u", + cfg->port.n_pipe_profiles); + + for (subport_id = 0; subport_id < vec_len (cfg->subport); subport_id++) + { + vlib_cli_output (vm, " Subport %u:", subport_id); + vlib_cli_output (vm, " Rate = %u bytes/second", + cfg->subport[subport_id].tb_rate); + vlib_cli_output (vm, " Token bucket size = %u bytes", + cfg->subport[subport_id].tb_size); + vlib_cli_output (vm, + " Traffic class rate: TC0 = %u, TC1 = %u, TC2 = %u, TC3 = %u bytes/second", + cfg->subport[subport_id].tc_rate[0], + cfg->subport[subport_id].tc_rate[1], + cfg->subport[subport_id].tc_rate[2], + cfg->subport[subport_id].tc_rate[3]); + vlib_cli_output (vm, " TC period = %u milliseconds", + cfg->subport[subport_id].tc_period); + } + + for (profile_id = 0; profile_id < vec_len (cfg->pipe); profile_id++) + { + vlib_cli_output (vm, " Pipe profile %u:", profile_id); + vlib_cli_output (vm, " Rate = %u bytes/second", + cfg->pipe[profile_id].tb_rate); + vlib_cli_output (vm, " Token bucket size = %u bytes", + cfg->pipe[profile_id].tb_size); + vlib_cli_output (vm, + " Traffic class rate: TC0 = %u, TC1 = %u, TC2 = %u, TC3 = %u bytes/second", + cfg->pipe[profile_id].tc_rate[0], + cfg->pipe[profile_id].tc_rate[1], + 
cfg->pipe[profile_id].tc_rate[2],
+                       cfg->pipe[profile_id].tc_rate[3]);
+      vlib_cli_output (vm, "    TC period = %u milliseconds",
+                       cfg->pipe[profile_id].tc_period);
+#ifdef RTE_SCHED_SUBPORT_TC_OV
+      vlib_cli_output (vm, "    TC3 oversubscription_weight = %u",
+                       cfg->pipe[profile_id].tc_ov_weight);
+#endif
+
+      for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
+        {
+          vlib_cli_output (vm,
+                           "    TC%u WRR weights: Q0 = %u, Q1 = %u, Q2 = %u, Q3 = %u",
+                           i, cfg->pipe[profile_id].wrr_weights[i * 4],
+                           cfg->pipe[profile_id].wrr_weights[i * 4 + 1],
+                           cfg->pipe[profile_id].wrr_weights[i * 4 + 2],
+                           cfg->pipe[profile_id].wrr_weights[i * 4 + 3]);
+        }
+    }
+
+#ifdef RTE_SCHED_RED
+  vlib_cli_output (vm, "  Weighted Random Early Detection (WRED):");
+  for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++)
+    {
+      vlib_cli_output (vm, "    TC%u min: G = %u, Y = %u, R = %u", i,
+                       cfg->port.red_params[i][e_RTE_METER_GREEN].min_th,
+                       cfg->port.red_params[i][e_RTE_METER_YELLOW].min_th,
+                       cfg->port.red_params[i][e_RTE_METER_RED].min_th);
+
+      vlib_cli_output (vm, "    TC%u max: G = %u, Y = %u, R = %u", i,
+                       cfg->port.red_params[i][e_RTE_METER_GREEN].max_th,
+                       cfg->port.red_params[i][e_RTE_METER_YELLOW].max_th,
+                       cfg->port.red_params[i][e_RTE_METER_RED].max_th);
+
+      vlib_cli_output (vm,
+                       "    TC%u inverted probability: G = %u, Y = %u, R = %u",
+                       i, cfg->port.red_params[i][e_RTE_METER_GREEN].maxp_inv,
+                       cfg->port.red_params[i][e_RTE_METER_YELLOW].maxp_inv,
+                       cfg->port.red_params[i][e_RTE_METER_RED].maxp_inv);
+
+      vlib_cli_output (vm, "    TC%u weight: G = %u, Y = %u, R = %u", i,
+                       cfg->port.red_params[i][e_RTE_METER_GREEN].wq_log2,
+                       cfg->port.red_params[i][e_RTE_METER_YELLOW].wq_log2,
+                       cfg->port.red_params[i][e_RTE_METER_RED].wq_log2);
+    }
+#endif
+
+done:
+  unformat_free (line_input);
+
+  return error;
+}
+
+/*?
+ * This command is used to display details of an output interface's HQoS
+ * settings.
+ *
+ * @cliexpar
+ * Example of how to display HQoS settings for an interface:
+ * @cliexstart{show dpdk interface hqos GigabitEthernet0/8/0}
+ *  Thread:
+ *    Input SWQ size = 4096 packets
+ *    Enqueue burst size = 256 packets
+ *    Dequeue burst size = 220 packets
+ *    Packet field 0: slab position = 0, slab bitmask = 0x0000000000000000 (subport)
+ *    Packet field 1: slab position = 40, slab bitmask = 0x0000000fff000000 (pipe)
+ *    Packet field 2: slab position = 8, slab bitmask = 0x00000000000000fc (tc)
+ *    Packet field 2 tc translation table: ([Mapped Value Range]: tc/queue tc/queue ...)
+ *    [ 0 .. 15]: 0/0 0/1 0/2 0/3 1/0 1/1 1/2 1/3 2/0 2/1 2/2 2/3 3/0 3/1 3/2 3/3
+ *    [16 .. 31]: 0/0 0/1 0/2 0/3 1/0 1/1 1/2 1/3 2/0 2/1 2/2 2/3 3/0 3/1 3/2 3/3
+ *    [32 .. 47]: 0/0 0/1 0/2 0/3 1/0 1/1 1/2 1/3 2/0 2/1 2/2 2/3 3/0 3/1 3/2 3/3
+ *    [48 ..
63]: 0/0 0/1 0/2 0/3 1/0 1/1 1/2 1/3 2/0 2/1 2/2 2/3 3/0 3/1 3/2 3/3 + * Port: + * Rate = 1250000000 bytes/second + * MTU = 1514 bytes + * Frame overhead = 24 bytes + * Number of subports = 1 + * Number of pipes per subport = 4096 + * Packet queue size: TC0 = 64, TC1 = 64, TC2 = 64, TC3 = 64 packets + * Number of pipe profiles = 2 + * Subport 0: + * Rate = 1250000000 bytes/second + * Token bucket size = 1000000 bytes + * Traffic class rate: TC0 = 1250000000, TC1 = 1250000000, TC2 = 1250000000, TC3 = 1250000000 bytes/second + * TC period = 10 milliseconds + * Pipe profile 0: + * Rate = 305175 bytes/second + * Token bucket size = 1000000 bytes + * Traffic class rate: TC0 = 305175, TC1 = 305175, TC2 = 305175, TC3 = 305175 bytes/second + * TC period = 40 milliseconds + * TC0 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1 + * TC1 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1 + * TC2 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1 + * TC3 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1 + * @cliexend +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_show_dpdk_if_hqos, static) = { + .path = "show dpdk interface hqos", + .short_help = "show dpdk interface hqos <interface>", + .function = show_dpdk_if_hqos, +}; + +/* *INDENT-ON* */ + +static clib_error_t * +show_dpdk_hqos_queue_stats (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + clib_error_t *error = NULL; +#ifdef RTE_SCHED_COLLECT_STATS + dpdk_main_t *dm = &dpdk_main; + u32 hw_if_index = (u32) ~ 0; + u32 subport = (u32) ~ 0; + u32 pipe = (u32) ~ 0; + u32 tc = (u32) ~ 0; + u32 tc_q = (u32) ~ 0; + vnet_hw_interface_t *hw; + dpdk_device_t *xd; + uword *p = 0; + struct rte_eth_dev_info dev_info; + dpdk_device_config_t *devconf = 0; + u32 qindex; + struct rte_sched_queue_stats stats; + u16 qlen; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat + (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main, + &hw_if_index)) + ; + + else if (unformat (line_input, "subport %d", &subport)) + ; + + else if (unformat (line_input, "pipe %d", &pipe)) + ; + + else if (unformat (line_input, "tc %d", &tc)) + ; + + else if (unformat (line_input, "tc_q %d", &tc_q)) + ; + + else + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (hw_if_index == (u32) ~ 0) + { + error = clib_error_return (0, "please specify interface name!!"); + goto done; + } + + hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index); + xd = vec_elt_at_index (dm->devices, hw->dev_instance); + + rte_eth_dev_info_get (xd->device_index, &dev_info); + if (dev_info.pci_dev) + { /* bonded interface has no pci info */ + vlib_pci_addr_t pci_addr; + + pci_addr.domain = dev_info.pci_dev->addr.domain; + pci_addr.bus = dev_info.pci_dev->addr.bus; + pci_addr.slot = dev_info.pci_dev->addr.devid; + pci_addr.function = dev_info.pci_dev->addr.function; + + p = + hash_get (dm->conf->device_config_index_by_pci_addr, pci_addr.as_u32); + } + + if (p) + devconf = pool_elt_at_index (dm->conf->dev_confs, p[0]); + else + devconf = &dm->conf->default_devconf; + + if (devconf->hqos_enabled == 0) + { + vlib_cli_output (vm, "HQoS disabled for this interface"); + goto done; + } + + /* + * Figure out which queue to query. cf rte_sched_port_qindex. (Not sure why + * that method isn't made public by DPDK - how _should_ we get the queue ID?) 
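+ * The computation below mirrors it: the flat queue index is
+ * ((subport * n_pipes_per_subport + pipe) * 4 + tc) * 4 + tc_q, with four
+ * traffic classes per pipe and four queues per traffic class.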
+ */ + qindex = subport * devconf->hqos.port.n_pipes_per_subport + pipe; + qindex = qindex * RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE + tc; + qindex = qindex * RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS + tc_q; + + if (rte_sched_queue_read_stats (xd->hqos_ht->hqos, qindex, &stats, &qlen) != + 0) + { + error = clib_error_return (0, "failed to read stats"); + goto done; + } + + vlib_cli_output (vm, "%=24s%=16s", "Stats Parameter", "Value"); + vlib_cli_output (vm, "%=24s%=16d", "Packets", stats.n_pkts); + vlib_cli_output (vm, "%=24s%=16d", "Packets dropped", stats.n_pkts_dropped); +#ifdef RTE_SCHED_RED + vlib_cli_output (vm, "%=24s%=16d", "Packets dropped (RED)", + stats.n_pkts_red_dropped); +#endif + vlib_cli_output (vm, "%=24s%=16d", "Bytes", stats.n_bytes); + vlib_cli_output (vm, "%=24s%=16d", "Bytes dropped", stats.n_bytes_dropped); + +#else + + /* Get a line of input */ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + vlib_cli_output (vm, "RTE_SCHED_COLLECT_STATS disabled in DPDK"); + goto done; + +#endif + +done: + unformat_free (line_input); + + return error; +} + +/*? + * This command is used to display statistics associated with a HQoS traffic class + * queue. + * + * @note + * Statistic collection by the scheduler is disabled by default in DPDK. In order to + * turn it on, add the following line to '<em>../vpp/dpdk/Makefile</em>': + * - <b>$(call set,RTE_SCHED_COLLECT_STATS,y)</b> + * + * @cliexpar + * Example of how to display statistics of HQoS a HQoS traffic class queue: + * @cliexstart{show dpdk hqos queue GigabitEthernet0/9/0 subport 0 pipe 3181 tc 0 tc_q 0} + * Stats Parameter Value + * Packets 140 + * Packets dropped 0 + * Bytes 8400 + * Bytes dropped 0 + * @cliexend +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_show_dpdk_hqos_queue_stats, static) = { + .path = "show dpdk hqos queue", + .short_help = "show dpdk hqos queue <interface> subport <subport_id> pipe <pipe_id> tc <tc_id> tc_q <queue_id>", + .function = show_dpdk_hqos_queue_stats, +}; +/* *INDENT-ON* */ + +static clib_error_t * +show_dpdk_version_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ +#define _(a,b,c) vlib_cli_output (vm, "%-25s " b, a ":", c); + _("DPDK Version", "%s", rte_version ()); + _("DPDK EAL init args", "%s", dpdk_config_main.eal_init_args_str); +#undef _ + return 0; +} + +/*? + * This command is used to display the current DPDK version and + * the list of arguments passed to DPDK when started. 
+ *
+ * @cliexpar
+ * Example of how to display the DPDK version and EAL init arguments:
+ * @cliexstart{show dpdk version}
+ * DPDK Version: DPDK 16.11.0
+ * DPDK EAL init args: -c 1 -n 4 --huge-dir /run/vpp/hugepages --file-prefix vpp -w 0000:00:08.0 -w 0000:00:09.0 --master-lcore 0 --socket-mem 256
+ * @cliexend
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (show_vpe_version_command, static) = {
+  .path = "show dpdk version",
+  .short_help = "show dpdk version",
+  .function = show_dpdk_version_command_fn,
+};
+/* *INDENT-ON* */
+
+#if CLI_DEBUG
+
+static clib_error_t *
+dpdk_validate_buffers_fn (vlib_main_t * vm, unformat_input_t * input,
+                          vlib_cli_command_t * cmd_arg)
+{
+  u32 n_invalid_bufs = 0, uninitialized = 0;
+  u32 is_poison = 0, is_test = 0;
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (input, "poison"))
+        is_poison = 1;
+      else if (unformat (input, "trajectory"))
+        is_test = 1;
+      else
+        return clib_error_return (0, "unknown input `%U'",
+                                  format_unformat_error, input);
+    }
+
+  if (VLIB_BUFFER_TRACE_TRAJECTORY == 0)
+    {
+      vlib_cli_output (vm, "Trajectory not enabled. Recompile with "
+                       "VLIB_BUFFER_TRACE_TRAJECTORY 1");
+      return 0;
+    }
+  if (is_poison)
+    {
+      dpdk_buffer_poison_trajectory_all ();
+    }
+  if (is_test)
+    {
+      n_invalid_bufs = dpdk_buffer_validate_trajectory_all (&uninitialized);
+      if (!n_invalid_bufs)
+        vlib_cli_output (vm, "All buffers are valid %d uninitialized",
+                         uninitialized);
+      else
+        vlib_cli_output (vm, "Found %d invalid buffers and %d uninitialized",
+                         n_invalid_bufs, uninitialized);
+    }
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (test_dpdk_buffers_command, static) =
+{
+  .path = "test dpdk buffers",
+  .short_help = "test dpdk buffers [poison] [trajectory]",
+  .function = dpdk_validate_buffers_fn,
+};
+/* *INDENT-ON* */
+
+#endif
+
+clib_error_t *
+dpdk_cli_init (vlib_main_t * vm)
+{
+  return 0;
+}
+
+VLIB_INIT_FUNCTION (dpdk_cli_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/dpdk/device/common.c b/src/plugins/dpdk/device/common.c
new file mode 100644
index 00000000..aedc3f52
--- /dev/null
+++ b/src/plugins/dpdk/device/common.c
@@ -0,0 +1,315 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include <vnet/vnet.h> +#include <vppinfra/vec.h> +#include <vppinfra/format.h> +#include <vlib/unix/cj.h> +#include <assert.h> + +#include <vnet/ip/ip.h> +#include <vnet/ethernet/ethernet.h> +#include <vnet/ethernet/arp_packet.h> +#include <dpdk/device/dpdk.h> + +#include <dpdk/device/dpdk_priv.h> +#include <vppinfra/error.h> + +void +dpdk_device_error (dpdk_device_t * xd, char *str, int rv) +{ + xd->errors = clib_error_return (xd->errors, "%s[port:%d, errno:%d]: %s", + str, xd->device_index, rv, + rte_strerror (rv)); +} + +void +dpdk_device_setup (dpdk_device_t * xd) +{ + dpdk_main_t *dm = &dpdk_main; + vnet_main_t *vnm = vnet_get_main (); + vnet_sw_interface_t *sw = vnet_get_sw_interface (vnm, xd->vlib_sw_if_index); + int rv; + int j; + + ASSERT (vlib_get_thread_index () == 0); + + clib_error_free (xd->errors); + sw->flags &= ~VNET_SW_INTERFACE_FLAG_ERROR; + + if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) + { + vnet_hw_interface_set_flags (dm->vnet_main, xd->hw_if_index, 0); + dpdk_device_stop (xd); + } + + rv = rte_eth_dev_configure (xd->device_index, xd->rx_q_used, + xd->tx_q_used, &xd->port_conf); + + if (rv < 0) + { + dpdk_device_error (xd, "rte_eth_dev_configure", rv); + goto error; + } + + /* Set up one TX-queue per worker thread */ + for (j = 0; j < xd->tx_q_used; j++) + { + rv = rte_eth_tx_queue_setup (xd->device_index, j, xd->nb_tx_desc, + xd->cpu_socket, &xd->tx_conf); + + /* retry with any other CPU socket */ + if (rv < 0) + rv = rte_eth_tx_queue_setup (xd->device_index, j, xd->nb_tx_desc, + SOCKET_ID_ANY, &xd->tx_conf); + if (rv < 0) + dpdk_device_error (xd, "rte_eth_tx_queue_setup", rv); + } + + for (j = 0; j < xd->rx_q_used; j++) + { + uword tidx = vnet_get_device_input_thread_index (dm->vnet_main, + xd->hw_if_index, j); + unsigned lcore = vlib_worker_threads[tidx].lcore_id; + u16 socket_id = rte_lcore_to_socket_id (lcore); + + rv = rte_eth_rx_queue_setup (xd->device_index, j, xd->nb_rx_desc, + xd->cpu_socket, 0, + dm->pktmbuf_pools[socket_id]); + + /* retry with any other CPU socket */ + if (rv < 0) + rv = rte_eth_rx_queue_setup (xd->device_index, j, xd->nb_rx_desc, + SOCKET_ID_ANY, 0, + dm->pktmbuf_pools[socket_id]); + + if (rv < 0) + dpdk_device_error (xd, "rte_eth_rx_queue_setup", rv); + } + + if (vec_len (xd->errors)) + goto error; + + if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) + dpdk_device_start (xd); + + if (vec_len (xd->errors)) + goto error; + + return; + +error: + xd->flags |= DPDK_DEVICE_FLAG_PMD_INIT_FAIL; + sw->flags |= VNET_SW_INTERFACE_FLAG_ERROR; +} + +void +dpdk_device_start (dpdk_device_t * xd) +{ + int rv; + + if (xd->flags & DPDK_DEVICE_FLAG_PMD_INIT_FAIL) + return; + + rv = rte_eth_dev_start (xd->device_index); + + if (rv) + { + dpdk_device_error (xd, "rte_eth_dev_start", rv); + return; + } + + if (xd->default_mac_address) + rv = + rte_eth_dev_default_mac_addr_set (xd->device_index, + (struct ether_addr *) + xd->default_mac_address); + + if (rv) + dpdk_device_error (xd, "rte_eth_dev_default_mac_addr_set", rv); + + if (xd->flags & DPDK_DEVICE_FLAG_PROMISC) + rte_eth_promiscuous_enable (xd->device_index); + else + rte_eth_promiscuous_disable (xd->device_index); + + rte_eth_allmulticast_enable (xd->device_index); + + if (xd->pmd == VNET_DPDK_PMD_BOND) + { + u8 slink[16]; + int nlink = rte_eth_bond_slaves_get (xd->device_index, slink, 16); + while (nlink >= 1) + { + u8 dpdk_port = slink[--nlink]; + rte_eth_allmulticast_enable (dpdk_port); + } + } +} + +void +dpdk_device_stop (dpdk_device_t * xd) +{ + if (xd->flags & DPDK_DEVICE_FLAG_PMD_INIT_FAIL) + 
    return;
+
+  rte_eth_allmulticast_disable (xd->device_index);
+  rte_eth_dev_stop (xd->device_index);
+
+  /* For bonded interfaces, stop the slave links */
+  if (xd->pmd == VNET_DPDK_PMD_BOND)
+    {
+      u8 slink[16];
+      int nlink = rte_eth_bond_slaves_get (xd->device_index, slink, 16);
+      while (nlink >= 1)
+        {
+          u8 dpdk_port = slink[--nlink];
+          rte_eth_dev_stop (dpdk_port);
+        }
+    }
+}
+
+/* Event type for send_garp_na_process */
+enum
+{
+  SEND_GARP_NA = 1,
+} dpdk_send_garp_na_process_event_t;
+
+static vlib_node_registration_t send_garp_na_proc_node;
+
+static uword
+send_garp_na_process (vlib_main_t * vm,
+                      vlib_node_runtime_t * rt, vlib_frame_t * f)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  uword event_type, *event_data = 0;
+
+  while (1)
+    {
+      u32 i;
+      uword dpdk_port;
+      vlib_process_wait_for_event (vm);
+      event_type = vlib_process_get_events (vm, &event_data);
+      ASSERT (event_type == SEND_GARP_NA);
+      for (i = 0; i < vec_len (event_data); i++)
+        {
+          dpdk_port = event_data[i];
+          if (i < 5)            /* wait 0.2 sec for link to settle, max total 1 sec */
+            vlib_process_suspend (vm, 0.2);
+          dpdk_device_t *xd = &dpdk_main.devices[dpdk_port];
+          u32 hw_if_index = xd->hw_if_index;
+          vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, hw_if_index);
+          dpdk_update_link_state (xd, vlib_time_now (vm));
+          send_ip4_garp (vm, hi);
+          send_ip6_na (vm, hi);
+        }
+      vec_reset_length (event_data);
+    }
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (send_garp_na_proc_node, static) = {
+    .function = send_garp_na_process,
+    .type = VLIB_NODE_TYPE_PROCESS,
+    .name = "send-garp-na-process",
+};
+/* *INDENT-ON* */
+
+void vl_api_force_rpc_call_main_thread (void *fp, u8 * data, u32 data_length);
+
+static void
+garp_na_proc_callback (uword * dpdk_port)
+{
+  vlib_main_t *vm = vlib_get_main ();
+  ASSERT (vlib_get_thread_index () == 0);
+  vlib_process_signal_event
+    (vm, send_garp_na_proc_node.index, SEND_GARP_NA, *dpdk_port);
+}
+
+always_inline int
+dpdk_port_state_callback_inline (uint8_t port_id,
+                                 enum rte_eth_event_type type, void *param)
+{
+  struct rte_eth_link link;
+  dpdk_device_t *xd = &dpdk_main.devices[port_id];
+
+  RTE_SET_USED (param);
+  if (type != RTE_ETH_EVENT_INTR_LSC)
+    {
+      clib_warning ("Unknown event %d received for port %d", type, port_id);
+      return -1;
+    }
+
+  rte_eth_link_get_nowait (port_id, &link);
+  u8 link_up = link.link_status;
+
+  if (xd->flags & DPDK_DEVICE_FLAG_BOND_SLAVE)
+    {
+      uword bd_port = xd->bond_port;
+      int bd_mode = rte_eth_bond_mode_get (bd_port);
+#if 0
+      clib_warning ("Port %d state to %s, "
+                    "slave of port %d BondEthernet%d in mode %d",
+                    port_id, (link_up) ? "UP" : "DOWN",
+                    bd_port, xd->port_id, bd_mode);
+#endif
+      if (bd_mode == BONDING_MODE_ACTIVE_BACKUP)
+        {
+          vl_api_force_rpc_call_main_thread
+            (garp_na_proc_callback, (u8 *) & bd_port, sizeof (uword));
+        }
+      /* Set or clear the slave-up flag. Note: OR-ing in the complemented
+         flag would set every other bit instead of clearing this one. */
+      if (link_up)
+        xd->flags |= DPDK_DEVICE_FLAG_BOND_SLAVE_UP;
+      else
+        xd->flags &= ~DPDK_DEVICE_FLAG_BOND_SLAVE_UP;
+    }
+  else                          /* Should not happen as callback not setup for "normal" links */
+    {
+      if (link_up)
+        clib_warning ("Port %d Link Up - speed %u Mbps - %s",
+                      port_id, (unsigned) link.link_speed,
+                      (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
+ "full-duplex" : "half-duplex"); + else + clib_warning ("Port %d Link Down\n\n", port_id); + } + + return 0; +} + +#if DPDK_VOID_CALLBACK +void +dpdk_port_state_callback (uint8_t port_id, + enum rte_eth_event_type type, void *param) +{ + dpdk_port_state_callback_inline (port_id, type, param); +} + +#else +int +dpdk_port_state_callback (uint8_t port_id, + enum rte_eth_event_type type, + void *param, + void *ret_param __attribute__ ((unused))) +{ + return dpdk_port_state_callback_inline (port_id, type, param); +} +#endif +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/dpdk/device/device.c b/src/plugins/dpdk/device/device.c new file mode 100644 index 00000000..aa134327 --- /dev/null +++ b/src/plugins/dpdk/device/device.c @@ -0,0 +1,856 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <vnet/vnet.h> +#include <vppinfra/vec.h> +#include <vppinfra/format.h> +#include <vlib/unix/cj.h> +#include <assert.h> + +#include <vnet/ethernet/ethernet.h> +#include <dpdk/device/dpdk.h> + +#include <dpdk/device/dpdk_priv.h> +#include <vppinfra/error.h> + +#define foreach_dpdk_tx_func_error \ + _(BAD_RETVAL, "DPDK tx function returned an error") \ + _(RING_FULL, "Tx packet drops (ring full)") \ + _(PKT_DROP, "Tx packet drops (dpdk tx failure)") \ + _(REPL_FAIL, "Tx packet drops (replication failure)") + +typedef enum +{ +#define _(f,s) DPDK_TX_FUNC_ERROR_##f, + foreach_dpdk_tx_func_error +#undef _ + DPDK_TX_FUNC_N_ERROR, +} dpdk_tx_func_error_t; + +static char *dpdk_tx_func_error_strings[] = { +#define _(n,s) s, + foreach_dpdk_tx_func_error +#undef _ +}; + +static clib_error_t * +dpdk_set_mac_address (vnet_hw_interface_t * hi, char *address) +{ + int error; + dpdk_main_t *dm = &dpdk_main; + dpdk_device_t *xd = vec_elt_at_index (dm->devices, hi->dev_instance); + + error = rte_eth_dev_default_mac_addr_set (xd->device_index, + (struct ether_addr *) address); + + if (error) + { + return clib_error_return (0, "mac address set failed: %d", error); + } + else + { + vec_reset_length (xd->default_mac_address); + vec_add (xd->default_mac_address, address, sizeof (address)); + return NULL; + } +} + +struct rte_mbuf * +dpdk_replicate_packet_mb (vlib_buffer_t * b) +{ + dpdk_main_t *dm = &dpdk_main; + struct rte_mbuf **mbufs = 0, *s, *d; + u8 nb_segs; + unsigned socket_id = rte_socket_id (); + int i; + + ASSERT (dm->pktmbuf_pools[socket_id]); + s = rte_mbuf_from_vlib_buffer (b); + nb_segs = s->nb_segs; + vec_validate (mbufs, nb_segs - 1); + + if (rte_pktmbuf_alloc_bulk (dm->pktmbuf_pools[socket_id], mbufs, nb_segs)) + { + vec_free (mbufs); + return 0; + } + + d = mbufs[0]; + d->nb_segs = s->nb_segs; + d->data_len = s->data_len; + d->pkt_len = s->pkt_len; + d->data_off = s->data_off; + clib_memcpy (d->buf_addr, s->buf_addr, RTE_PKTMBUF_HEADROOM + s->data_len); + + for (i = 1; i < nb_segs; i++) + { + d->next = mbufs[i]; + d = mbufs[i]; + s = s->next; + 
d->data_len = s->data_len; + clib_memcpy (d->buf_addr, s->buf_addr, + RTE_PKTMBUF_HEADROOM + s->data_len); + } + + d = mbufs[0]; + vec_free (mbufs); + return d; +} + +static void +dpdk_tx_trace_buffer (dpdk_main_t * dm, + vlib_node_runtime_t * node, + dpdk_device_t * xd, + u16 queue_id, u32 buffer_index, vlib_buffer_t * buffer) +{ + vlib_main_t *vm = vlib_get_main (); + dpdk_tx_dma_trace_t *t0; + struct rte_mbuf *mb; + + mb = rte_mbuf_from_vlib_buffer (buffer); + + t0 = vlib_add_trace (vm, node, buffer, sizeof (t0[0])); + t0->queue_index = queue_id; + t0->device_index = xd->device_index; + t0->buffer_index = buffer_index; + clib_memcpy (&t0->mb, mb, sizeof (t0->mb)); + clib_memcpy (&t0->buffer, buffer, + sizeof (buffer[0]) - sizeof (buffer->pre_data)); + clib_memcpy (t0->buffer.pre_data, buffer->data + buffer->current_data, + sizeof (t0->buffer.pre_data)); +} + +static_always_inline void +dpdk_validate_rte_mbuf (vlib_main_t * vm, vlib_buffer_t * b, + int maybe_multiseg) +{ + struct rte_mbuf *mb, *first_mb, *last_mb; + + /* buffer is coming from non-dpdk source so we need to init + rte_mbuf header */ + if (PREDICT_FALSE ((b->flags & VLIB_BUFFER_EXT_HDR_VALID) == 0)) + { + vlib_buffer_t *b2 = b; + last_mb = mb = rte_mbuf_from_vlib_buffer (b2); + rte_pktmbuf_reset (mb); + while (maybe_multiseg && (b2->flags & VLIB_BUFFER_NEXT_PRESENT)) + { + b2 = vlib_get_buffer (vm, b2->next_buffer); + mb = rte_mbuf_from_vlib_buffer (b2); + rte_pktmbuf_reset (mb); + } + } + + last_mb = first_mb = mb = rte_mbuf_from_vlib_buffer (b); + first_mb->nb_segs = 1; + mb->data_len = b->current_length; + mb->pkt_len = maybe_multiseg ? vlib_buffer_length_in_chain (vm, b) : + b->current_length; + mb->data_off = VLIB_BUFFER_PRE_DATA_SIZE + b->current_data; + + while (maybe_multiseg && (b->flags & VLIB_BUFFER_NEXT_PRESENT)) + { + b = vlib_get_buffer (vm, b->next_buffer); + mb = rte_mbuf_from_vlib_buffer (b); + last_mb->next = mb; + last_mb = mb; + mb->data_len = b->current_length; + mb->pkt_len = b->current_length; + mb->data_off = VLIB_BUFFER_PRE_DATA_SIZE + b->current_data; + first_mb->nb_segs++; + if (PREDICT_FALSE (b->n_add_refs)) + { + rte_mbuf_refcnt_update (mb, b->n_add_refs); + b->n_add_refs = 0; + } + } +} + +/* + * This function calls the dpdk's tx_burst function to transmit the packets + * on the tx_vector. It manages a lock per-device if the device does not + * support multiple queues. It returns the number of packets untransmitted + * on the tx_vector. If all packets are transmitted (the normal case), the + * function returns 0. + * + * The function assumes there is at least one packet on the tx_vector. + */ +static_always_inline + u32 tx_burst_vector_internal (vlib_main_t * vm, + dpdk_device_t * xd, + struct rte_mbuf **tx_vector) +{ + dpdk_main_t *dm = &dpdk_main; + u32 n_packets; + u32 tx_head; + u32 tx_tail; + u32 n_retry; + int rv; + int queue_id; + tx_ring_hdr_t *ring; + + ring = vec_header (tx_vector, sizeof (*ring)); + + n_packets = ring->tx_head - ring->tx_tail; + + tx_head = ring->tx_head % xd->nb_tx_desc; + + /* + * Ensure rte_eth_tx_burst is not called with 0 packets, which can lead to + * unpredictable results. + */ + ASSERT (n_packets > 0); + + /* + * Check for tx_vector overflow. If this fails it is a system configuration + * error. The ring should be sized big enough to handle the largest un-flowed + * off burst from a traffic manager. A larger size also helps performance + * a bit because it decreases the probability of having to issue two tx_burst + * calls due to a ring wrap. 
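+ *
+ * As a rough sizing illustration (the numbers are an assumption, not fixed
+ * by this code): with nb_tx_desc = 1024 and frames of at most
+ * VLIB_FRAME_SIZE (256) packets, several partially-flushed frames can sit on
+ * the ring before the overflow check in dpdk_interface_tx starts dropping
+ * whole frames.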
+ */ + ASSERT (n_packets < xd->nb_tx_desc); + ASSERT (ring->tx_tail == 0); + + n_retry = 16; + queue_id = vm->thread_index; + + do + { + /* start the burst at the tail */ + tx_tail = ring->tx_tail % xd->nb_tx_desc; + + /* + * This device only supports one TX queue, + * and we're running multi-threaded... + */ + if (PREDICT_FALSE (xd->lockp != 0)) + { + queue_id = queue_id % xd->tx_q_used; + while (__sync_lock_test_and_set (xd->lockp[queue_id], 1)) + /* zzzz */ + queue_id = (queue_id + 1) % xd->tx_q_used; + } + + if (PREDICT_FALSE (xd->flags & DPDK_DEVICE_FLAG_HQOS)) /* HQoS ON */ + { + /* no wrap, transmit in one burst */ + dpdk_device_hqos_per_worker_thread_t *hqos = + &xd->hqos_wt[vm->thread_index]; + + ASSERT (hqos->swq != NULL); + + dpdk_hqos_metadata_set (hqos, + &tx_vector[tx_tail], tx_head - tx_tail); + rv = rte_ring_sp_enqueue_burst (hqos->swq, + (void **) &tx_vector[tx_tail], + (uint16_t) (tx_head - tx_tail), 0); + } + else if (PREDICT_TRUE (xd->flags & DPDK_DEVICE_FLAG_PMD)) + { + /* no wrap, transmit in one burst */ + rv = rte_eth_tx_burst (xd->device_index, + (uint16_t) queue_id, + &tx_vector[tx_tail], + (uint16_t) (tx_head - tx_tail)); + } + else + { + ASSERT (0); + rv = 0; + } + + if (PREDICT_FALSE (xd->lockp != 0)) + *xd->lockp[queue_id] = 0; + + if (PREDICT_FALSE (rv < 0)) + { + // emit non-fatal message, bump counter + vnet_main_t *vnm = dm->vnet_main; + vnet_interface_main_t *im = &vnm->interface_main; + u32 node_index; + + node_index = vec_elt_at_index (im->hw_interfaces, + xd->hw_if_index)->tx_node_index; + + vlib_error_count (vm, node_index, DPDK_TX_FUNC_ERROR_BAD_RETVAL, 1); + clib_warning ("rte_eth_tx_burst[%d]: error %d", xd->device_index, + rv); + return n_packets; // untransmitted packets + } + ring->tx_tail += (u16) rv; + n_packets -= (uint16_t) rv; + } + while (rv && n_packets && (n_retry > 0)); + + return n_packets; +} + +static_always_inline void +dpdk_prefetch_buffer_by_index (vlib_main_t * vm, u32 bi) +{ + vlib_buffer_t *b; + struct rte_mbuf *mb; + b = vlib_get_buffer (vm, bi); + mb = rte_mbuf_from_vlib_buffer (b); + CLIB_PREFETCH (mb, 2 * CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (b, CLIB_CACHE_LINE_BYTES, LOAD); +} + +static_always_inline void +dpdk_buffer_recycle (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_buffer_t * b, u32 bi, struct rte_mbuf **mbp) +{ + dpdk_main_t *dm = &dpdk_main; + u32 my_cpu = vm->thread_index; + struct rte_mbuf *mb_new; + + if (PREDICT_FALSE (b->flags & VLIB_BUFFER_RECYCLE) == 0) + return; + + mb_new = dpdk_replicate_packet_mb (b); + if (PREDICT_FALSE (mb_new == 0)) + { + vlib_error_count (vm, node->node_index, + DPDK_TX_FUNC_ERROR_REPL_FAIL, 1); + b->flags |= VLIB_BUFFER_REPL_FAIL; + } + else + *mbp = mb_new; + + vec_add1 (dm->recycle[my_cpu], bi); +} + +static_always_inline void +dpdk_buffer_tx_offload (dpdk_device_t * xd, vlib_buffer_t * b, + struct rte_mbuf *mb) +{ + u32 ip_cksum = b->flags & VNET_BUFFER_F_OFFLOAD_IP_CKSUM; + u32 tcp_cksum = b->flags & VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; + u32 udp_cksum = b->flags & VNET_BUFFER_F_OFFLOAD_UDP_CKSUM; + int is_ip4 = b->flags & VNET_BUFFER_F_IS_IP4; + u64 ol_flags; + + /* Is there any work for us? */ + if (PREDICT_TRUE ((ip_cksum | tcp_cksum | udp_cksum) == 0)) + return; + + mb->l2_len = vnet_buffer (b)->l3_hdr_offset - b->current_data; + mb->l3_len = vnet_buffer (b)->l4_hdr_offset - + vnet_buffer (b)->l3_hdr_offset; + mb->outer_l3_len = 0; + mb->outer_l2_len = 0; + ol_flags = is_ip4 ? PKT_TX_IPV4 : PKT_TX_IPV6; + ol_flags |= ip_cksum ? 
PKT_TX_IP_CKSUM : 0; + ol_flags |= tcp_cksum ? PKT_TX_TCP_CKSUM : 0; + ol_flags |= udp_cksum ? PKT_TX_UDP_CKSUM : 0; + mb->ol_flags |= ol_flags; + + /* we are trying to help compiler here by using local ol_flags with known + state of all flags */ + if (xd->flags & DPDK_DEVICE_FLAG_INTEL_PHDR_CKSUM) + rte_net_intel_cksum_flags_prepare (mb, ol_flags); +} + +/* + * Transmits the packets on the frame to the interface associated with the + * node. It first copies packets on the frame to a tx_vector containing the + * rte_mbuf pointers. It then passes this vector to tx_burst_vector_internal + * which calls the dpdk tx_burst function. + */ +static uword +dpdk_interface_tx (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * f) +{ + dpdk_main_t *dm = &dpdk_main; + vnet_interface_output_runtime_t *rd = (void *) node->runtime_data; + dpdk_device_t *xd = vec_elt_at_index (dm->devices, rd->dev_instance); + u32 n_packets = f->n_vectors; + u32 n_left; + u32 *from; + struct rte_mbuf **tx_vector; + u16 i; + u16 nb_tx_desc = xd->nb_tx_desc; + int queue_id; + u32 my_cpu; + u32 tx_pkts = 0; + tx_ring_hdr_t *ring; + u32 n_on_ring; + + my_cpu = vm->thread_index; + + queue_id = my_cpu; + + tx_vector = xd->tx_vectors[queue_id]; + ring = vec_header (tx_vector, sizeof (*ring)); + + n_on_ring = ring->tx_head - ring->tx_tail; + from = vlib_frame_vector_args (f); + + ASSERT (n_packets <= VLIB_FRAME_SIZE); + + if (PREDICT_FALSE (n_on_ring + n_packets > nb_tx_desc)) + { + /* + * Overflowing the ring should never happen. + * If it does then drop the whole frame. + */ + vlib_error_count (vm, node->node_index, DPDK_TX_FUNC_ERROR_RING_FULL, + n_packets); + + while (n_packets--) + { + u32 bi0 = from[n_packets]; + vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0); + struct rte_mbuf *mb0 = rte_mbuf_from_vlib_buffer (b0); + rte_pktmbuf_free (mb0); + } + return n_on_ring; + } + + if (PREDICT_FALSE (dm->tx_pcap_enable)) + { + n_left = n_packets; + while (n_left > 0) + { + u32 bi0 = from[0]; + vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0); + if (dm->pcap_sw_if_index == 0 || + dm->pcap_sw_if_index == vnet_buffer (b0)->sw_if_index[VLIB_TX]) + pcap_add_buffer (&dm->pcap_main, vm, bi0, 512); + from++; + n_left--; + } + } + + from = vlib_frame_vector_args (f); + n_left = n_packets; + i = ring->tx_head % nb_tx_desc; + + while (n_left >= 8) + { + u32 bi0, bi1, bi2, bi3; + struct rte_mbuf *mb0, *mb1, *mb2, *mb3; + vlib_buffer_t *b0, *b1, *b2, *b3; + u32 or_flags; + + dpdk_prefetch_buffer_by_index (vm, from[4]); + dpdk_prefetch_buffer_by_index (vm, from[5]); + dpdk_prefetch_buffer_by_index (vm, from[6]); + dpdk_prefetch_buffer_by_index (vm, from[7]); + + bi0 = from[0]; + bi1 = from[1]; + bi2 = from[2]; + bi3 = from[3]; + from += 4; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + b2 = vlib_get_buffer (vm, bi2); + b3 = vlib_get_buffer (vm, bi3); + + or_flags = b0->flags | b1->flags | b2->flags | b3->flags; + + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b1); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b2); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b3); + + if (or_flags & VLIB_BUFFER_NEXT_PRESENT) + { + dpdk_validate_rte_mbuf (vm, b0, 1); + dpdk_validate_rte_mbuf (vm, b1, 1); + dpdk_validate_rte_mbuf (vm, b2, 1); + dpdk_validate_rte_mbuf (vm, b3, 1); + } + else + { + dpdk_validate_rte_mbuf (vm, b0, 0); + dpdk_validate_rte_mbuf (vm, b1, 0); + dpdk_validate_rte_mbuf (vm, b2, 0); + dpdk_validate_rte_mbuf (vm, b3, 0); + } + + mb0 = rte_mbuf_from_vlib_buffer (b0); + mb1 = 
rte_mbuf_from_vlib_buffer (b1); + mb2 = rte_mbuf_from_vlib_buffer (b2); + mb3 = rte_mbuf_from_vlib_buffer (b3); + + if (PREDICT_FALSE ((xd->flags & DPDK_DEVICE_FLAG_TX_OFFLOAD) && + (or_flags & + (VNET_BUFFER_F_OFFLOAD_TCP_CKSUM + | VNET_BUFFER_F_OFFLOAD_IP_CKSUM + | VNET_BUFFER_F_OFFLOAD_UDP_CKSUM)))) + { + dpdk_buffer_tx_offload (xd, b0, mb0); + dpdk_buffer_tx_offload (xd, b1, mb1); + dpdk_buffer_tx_offload (xd, b2, mb2); + dpdk_buffer_tx_offload (xd, b3, mb3); + } + + if (PREDICT_FALSE (or_flags & VLIB_BUFFER_RECYCLE)) + { + dpdk_buffer_recycle (vm, node, b0, bi0, &mb0); + dpdk_buffer_recycle (vm, node, b1, bi1, &mb1); + dpdk_buffer_recycle (vm, node, b2, bi2, &mb2); + dpdk_buffer_recycle (vm, node, b3, bi3, &mb3); + + /* dont enqueue packets if replication failed as they must + be sent back to recycle */ + if (PREDICT_TRUE ((b0->flags & VLIB_BUFFER_REPL_FAIL) == 0)) + tx_vector[i++ % nb_tx_desc] = mb0; + if (PREDICT_TRUE ((b1->flags & VLIB_BUFFER_REPL_FAIL) == 0)) + tx_vector[i++ % nb_tx_desc] = mb1; + if (PREDICT_TRUE ((b2->flags & VLIB_BUFFER_REPL_FAIL) == 0)) + tx_vector[i++ % nb_tx_desc] = mb2; + if (PREDICT_TRUE ((b3->flags & VLIB_BUFFER_REPL_FAIL) == 0)) + tx_vector[i++ % nb_tx_desc] = mb3; + } + else + { + if (PREDICT_FALSE (i + 3 >= nb_tx_desc)) + { + tx_vector[i++ % nb_tx_desc] = mb0; + tx_vector[i++ % nb_tx_desc] = mb1; + tx_vector[i++ % nb_tx_desc] = mb2; + tx_vector[i++ % nb_tx_desc] = mb3; + i %= nb_tx_desc; + } + else + { + tx_vector[i++] = mb0; + tx_vector[i++] = mb1; + tx_vector[i++] = mb2; + tx_vector[i++] = mb3; + } + } + + + if (PREDICT_FALSE (node->flags & VLIB_NODE_FLAG_TRACE)) + { + if (b0->flags & VLIB_BUFFER_IS_TRACED) + dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi0, b0); + if (b1->flags & VLIB_BUFFER_IS_TRACED) + dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi1, b1); + if (b2->flags & VLIB_BUFFER_IS_TRACED) + dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi2, b2); + if (b3->flags & VLIB_BUFFER_IS_TRACED) + dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi3, b3); + } + + n_left -= 4; + } + while (n_left > 0) + { + u32 bi0; + struct rte_mbuf *mb0; + vlib_buffer_t *b0; + + bi0 = from[0]; + from++; + + b0 = vlib_get_buffer (vm, bi0); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + + dpdk_validate_rte_mbuf (vm, b0, 1); + + mb0 = rte_mbuf_from_vlib_buffer (b0); + dpdk_buffer_tx_offload (xd, b0, mb0); + dpdk_buffer_recycle (vm, node, b0, bi0, &mb0); + + if (PREDICT_FALSE (node->flags & VLIB_NODE_FLAG_TRACE)) + if (b0->flags & VLIB_BUFFER_IS_TRACED) + dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi0, b0); + + if (PREDICT_TRUE ((b0->flags & VLIB_BUFFER_REPL_FAIL) == 0)) + { + tx_vector[i % nb_tx_desc] = mb0; + i++; + } + n_left--; + } + + /* account for additional packets in the ring */ + ring->tx_head += n_packets; + n_on_ring = ring->tx_head - ring->tx_tail; + + /* transmit as many packets as possible */ + n_packets = tx_burst_vector_internal (vm, xd, tx_vector); + + /* + * tx_pkts is the number of packets successfully transmitted + * This is the number originally on ring minus the number remaining on ring + */ + tx_pkts = n_on_ring - n_packets; + + { + /* If there is no callback then drop any non-transmitted packets */ + if (PREDICT_FALSE (n_packets)) + { + vlib_simple_counter_main_t *cm; + vnet_main_t *vnm = vnet_get_main (); + + cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, + VNET_INTERFACE_COUNTER_TX_ERROR); + + vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index, + n_packets); + + vlib_error_count (vm, node->node_index, 
DPDK_TX_FUNC_ERROR_PKT_DROP, + n_packets); + + while (n_packets--) + rte_pktmbuf_free (tx_vector[ring->tx_tail + n_packets]); + } + + /* Reset head/tail to avoid unnecessary wrap */ + ring->tx_head = 0; + ring->tx_tail = 0; + } + + /* Recycle replicated buffers */ + if (PREDICT_FALSE (vec_len (dm->recycle[my_cpu]))) + { + vlib_buffer_free (vm, dm->recycle[my_cpu], + vec_len (dm->recycle[my_cpu])); + _vec_len (dm->recycle[my_cpu]) = 0; + } + + ASSERT (ring->tx_head >= ring->tx_tail); + + return tx_pkts; +} + +static void +dpdk_clear_hw_interface_counters (u32 instance) +{ + dpdk_main_t *dm = &dpdk_main; + dpdk_device_t *xd = vec_elt_at_index (dm->devices, instance); + + /* + * Set the "last_cleared_stats" to the current stats, so that + * things appear to clear from a display perspective. + */ + dpdk_update_counters (xd, vlib_time_now (dm->vlib_main)); + + clib_memcpy (&xd->last_cleared_stats, &xd->stats, sizeof (xd->stats)); + clib_memcpy (xd->last_cleared_xstats, xd->xstats, + vec_len (xd->last_cleared_xstats) * + sizeof (xd->last_cleared_xstats[0])); + +} + +static clib_error_t * +dpdk_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags) +{ + vnet_hw_interface_t *hif = vnet_get_hw_interface (vnm, hw_if_index); + uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0; + dpdk_main_t *dm = &dpdk_main; + dpdk_device_t *xd = vec_elt_at_index (dm->devices, hif->dev_instance); + + if (xd->flags & DPDK_DEVICE_FLAG_PMD_INIT_FAIL) + return clib_error_return (0, "Interface not initialized"); + + if (is_up) + { + vnet_hw_interface_set_flags (vnm, xd->hw_if_index, + VNET_HW_INTERFACE_FLAG_LINK_UP); + if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) == 0) + dpdk_device_start (xd); + xd->flags |= DPDK_DEVICE_FLAG_ADMIN_UP; + f64 now = vlib_time_now (dm->vlib_main); + dpdk_update_counters (xd, now); + dpdk_update_link_state (xd, now); + } + else + { + vnet_hw_interface_set_flags (vnm, xd->hw_if_index, 0); + if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) != 0) + dpdk_device_stop (xd); + xd->flags &= ~DPDK_DEVICE_FLAG_ADMIN_UP; + } + + return /* no error */ 0; +} + +/* + * Dynamically redirect all pkts from a specific interface + * to the specified node + */ +static void +dpdk_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index, + u32 node_index) +{ + dpdk_main_t *xm = &dpdk_main; + vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); + dpdk_device_t *xd = vec_elt_at_index (xm->devices, hw->dev_instance); + + /* Shut off redirection */ + if (node_index == ~0) + { + xd->per_interface_next_index = node_index; + return; + } + + xd->per_interface_next_index = + vlib_node_add_next (xm->vlib_main, dpdk_input_node.index, node_index); +} + + +static clib_error_t * +dpdk_subif_add_del_function (vnet_main_t * vnm, + u32 hw_if_index, + struct vnet_sw_interface_t *st, int is_add) +{ + dpdk_main_t *xm = &dpdk_main; + vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); + dpdk_device_t *xd = vec_elt_at_index (xm->devices, hw->dev_instance); + vnet_sw_interface_t *t = (vnet_sw_interface_t *) st; + int r, vlan_offload; + u32 prev_subifs = xd->num_subifs; + clib_error_t *err = 0; + + if (is_add) + xd->num_subifs++; + else if (xd->num_subifs) + xd->num_subifs--; + + if ((xd->flags & DPDK_DEVICE_FLAG_PMD) == 0) + goto done; + + /* currently we program VLANS only for IXGBE VF and I40E VF */ + if ((xd->pmd != VNET_DPDK_PMD_IXGBEVF) && (xd->pmd != VNET_DPDK_PMD_I40EVF)) + goto done; + + if (t->sub.eth.flags.no_tags == 1) + goto done; + + if ((t->sub.eth.flags.one_tag != 1) 
|| (t->sub.eth.flags.exact_match != 1)) + { + xd->num_subifs = prev_subifs; + err = clib_error_return (0, "unsupported VLAN setup"); + goto done; + } + + vlan_offload = rte_eth_dev_get_vlan_offload (xd->device_index); + vlan_offload |= ETH_VLAN_FILTER_OFFLOAD; + + if ((r = rte_eth_dev_set_vlan_offload (xd->device_index, vlan_offload))) + { + xd->num_subifs = prev_subifs; + err = clib_error_return (0, "rte_eth_dev_set_vlan_offload[%d]: err %d", + xd->device_index, r); + goto done; + } + + + if ((r = + rte_eth_dev_vlan_filter (xd->device_index, t->sub.eth.outer_vlan_id, + is_add))) + { + xd->num_subifs = prev_subifs; + err = clib_error_return (0, "rte_eth_dev_vlan_filter[%d]: err %d", + xd->device_index, r); + goto done; + } + +done: + if (xd->num_subifs) + xd->flags |= DPDK_DEVICE_FLAG_HAVE_SUBIF; + else + xd->flags &= ~DPDK_DEVICE_FLAG_HAVE_SUBIF; + + return err; +} + +/* *INDENT-OFF* */ +VNET_DEVICE_CLASS (dpdk_device_class) = { + .name = "dpdk", + .tx_function = dpdk_interface_tx, + .tx_function_n_errors = DPDK_TX_FUNC_N_ERROR, + .tx_function_error_strings = dpdk_tx_func_error_strings, + .format_device_name = format_dpdk_device_name, + .format_device = format_dpdk_device, + .format_tx_trace = format_dpdk_tx_dma_trace, + .clear_counters = dpdk_clear_hw_interface_counters, + .admin_up_down_function = dpdk_interface_admin_up_down, + .subif_add_del_function = dpdk_subif_add_del_function, + .rx_redirect_to_node = dpdk_set_interface_next_node, + .mac_addr_change_function = dpdk_set_mac_address, +}; + +VLIB_DEVICE_TX_FUNCTION_MULTIARCH (dpdk_device_class, dpdk_interface_tx) +/* *INDENT-ON* */ + +#define UP_DOWN_FLAG_EVENT 1 + +uword +admin_up_down_process (vlib_main_t * vm, + vlib_node_runtime_t * rt, vlib_frame_t * f) +{ + clib_error_t *error = 0; + uword event_type; + uword *event_data = 0; + u32 sw_if_index; + u32 flags; + + while (1) + { + vlib_process_wait_for_event (vm); + + event_type = vlib_process_get_events (vm, &event_data); + + dpdk_main.admin_up_down_in_progress = 1; + + switch (event_type) + { + case UP_DOWN_FLAG_EVENT: + { + if (vec_len (event_data) == 2) + { + sw_if_index = event_data[0]; + flags = event_data[1]; + error = + vnet_sw_interface_set_flags (vnet_get_main (), sw_if_index, + flags); + clib_error_report (error); + } + } + break; + } + + vec_reset_length (event_data); + + dpdk_main.admin_up_down_in_progress = 0; + + } + return 0; /* or not */ +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (admin_up_down_process_node,static) = { + .function = admin_up_down_process, + .type = VLIB_NODE_TYPE_PROCESS, + .name = "admin-up-down-process", + .process_log2_n_stack_bytes = 17, // 128KB +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/dpdk/device/dir.dox b/src/plugins/dpdk/device/dir.dox new file mode 100644 index 00000000..43e36753 --- /dev/null +++ b/src/plugins/dpdk/device/dir.dox @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Doxygen directory documentation */ + +/** +@dir +@brief DPDK Abstraction Layer. + +This directory contains the source code for the DPDK abstraction layer. + +*/ +/*? %%clicmd:group_label DPDK and pcap tx %% ?*/ +/*? %%syscfg:group_label DPDK and pcap tx %% ?*/ diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h new file mode 100644 index 00000000..9762c713 --- /dev/null +++ b/src/plugins/dpdk/device/dpdk.h @@ -0,0 +1,483 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_dpdk_h__ +#define __included_dpdk_h__ + +/* $$$$ We should rename always_inline -> clib_always_inline */ +#undef always_inline + +#include <rte_config.h> + +#include <rte_common.h> +#include <rte_dev.h> +#include <rte_memory.h> +#include <rte_eal.h> +#include <rte_per_lcore.h> +#include <rte_cycles.h> +#include <rte_lcore.h> +#include <rte_per_lcore.h> +#include <rte_interrupts.h> +#include <rte_pci.h> +#include <rte_ether.h> +#include <rte_ethdev.h> +#include <rte_ring.h> +#include <rte_mempool.h> +#include <rte_mbuf.h> +#include <rte_version.h> +#include <rte_eth_bond.h> +#include <rte_sched.h> +#include <rte_net.h> + +#include <vnet/unix/pcap.h> +#include <vnet/devices/devices.h> + +#if CLIB_DEBUG > 0 +#define always_inline static inline +#else +#define always_inline static inline __attribute__ ((__always_inline__)) +#endif + +#include <vlib/pci/pci.h> + +#define NB_MBUF (16<<10) + +extern vnet_device_class_t dpdk_device_class; +extern vlib_node_registration_t dpdk_input_node; + +#define foreach_dpdk_pmd \ + _ ("net_thunderx", THUNDERX) \ + _ ("net_e1000_em", E1000EM) \ + _ ("net_e1000_igb", IGB) \ + _ ("net_e1000_igb_vf", IGBVF) \ + _ ("net_ixgbe", IXGBE) \ + _ ("net_ixgbe_vf", IXGBEVF) \ + _ ("net_i40e", I40E) \ + _ ("net_i40e_vf", I40EVF) \ + _ ("net_virtio", VIRTIO) \ + _ ("net_enic", ENIC) \ + _ ("net_vmxnet3", VMXNET3) \ + _ ("AF_PACKET PMD", AF_PACKET) \ + _ ("net_bonding", BOND) \ + _ ("net_fm10k", FM10K) \ + _ ("net_cxgbe", CXGBE) \ + _ ("net_mlx4", MLX4) \ + _ ("net_mlx5", MLX5) \ + _ ("net_dpaa2", DPAA2) \ + _ ("net_virtio_user", VIRTIO_USER) \ + _ ("net_vhost", VHOST_ETHER) + +typedef enum +{ + VNET_DPDK_PMD_NONE, +#define _(s,f) VNET_DPDK_PMD_##f, + foreach_dpdk_pmd +#undef _ + VNET_DPDK_PMD_UNKNOWN, /* must be last */ +} dpdk_pmd_t; + +typedef enum +{ + VNET_DPDK_PORT_TYPE_ETH_1G, + VNET_DPDK_PORT_TYPE_ETH_10G, + VNET_DPDK_PORT_TYPE_ETH_25G, + VNET_DPDK_PORT_TYPE_ETH_40G, + VNET_DPDK_PORT_TYPE_ETH_50G, + VNET_DPDK_PORT_TYPE_ETH_100G, + VNET_DPDK_PORT_TYPE_ETH_BOND, + VNET_DPDK_PORT_TYPE_ETH_SWITCH, + VNET_DPDK_PORT_TYPE_AF_PACKET, + VNET_DPDK_PORT_TYPE_ETH_VF, + VNET_DPDK_PORT_TYPE_VIRTIO_USER, + VNET_DPDK_PORT_TYPE_VHOST_ETHER, + VNET_DPDK_PORT_TYPE_UNKNOWN, +} dpdk_port_type_t; + +/* + * The header for the tx_vector in dpdk_device_t. 
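+ * It lives in the vec header just ahead of the ring entries and is fetched
+ * with vec_header() in dpdk_interface_tx().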
+ * Head and tail are indexes into the tx_vector and are of type + * u64 so they never overflow. + */ +typedef struct +{ + u64 tx_head; + u64 tx_tail; +} tx_ring_hdr_t; + +typedef struct +{ + struct rte_ring *swq; + + u64 hqos_field0_slabmask; + u32 hqos_field0_slabpos; + u32 hqos_field0_slabshr; + u64 hqos_field1_slabmask; + u32 hqos_field1_slabpos; + u32 hqos_field1_slabshr; + u64 hqos_field2_slabmask; + u32 hqos_field2_slabpos; + u32 hqos_field2_slabshr; + u32 hqos_tc_table[64]; +} dpdk_device_hqos_per_worker_thread_t; + +typedef struct +{ + struct rte_ring **swq; + struct rte_mbuf **pkts_enq; + struct rte_mbuf **pkts_deq; + struct rte_sched_port *hqos; + u32 hqos_burst_enq; + u32 hqos_burst_deq; + u32 pkts_enq_len; + u32 swq_pos; + u32 flush_count; +} dpdk_device_hqos_per_hqos_thread_t; + +typedef struct +{ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + volatile u32 **lockp; + + /* Instance ID */ + u32 device_index; + + u32 hw_if_index; + u32 vlib_sw_if_index; + + /* next node index if we decide to steal the rx graph arc */ + u32 per_interface_next_index; + + /* dpdk rte_mbuf rx and tx vectors, VLIB_FRAME_SIZE */ + struct rte_mbuf ***tx_vectors; /* one per worker thread */ + struct rte_mbuf ***rx_vectors; + + /* vector of traced contexts, per device */ + u32 **d_trace_buffers; + + dpdk_pmd_t pmd:8; + i8 cpu_socket; + + u16 flags; +#define DPDK_DEVICE_FLAG_ADMIN_UP (1 << 0) +#define DPDK_DEVICE_FLAG_PROMISC (1 << 1) +#define DPDK_DEVICE_FLAG_PMD (1 << 2) +#define DPDK_DEVICE_FLAG_PMD_INIT_FAIL (1 << 3) +#define DPDK_DEVICE_FLAG_MAYBE_MULTISEG (1 << 4) +#define DPDK_DEVICE_FLAG_HAVE_SUBIF (1 << 5) +#define DPDK_DEVICE_FLAG_HQOS (1 << 6) +#define DPDK_DEVICE_FLAG_BOND_SLAVE (1 << 7) +#define DPDK_DEVICE_FLAG_BOND_SLAVE_UP (1 << 8) +#define DPDK_DEVICE_FLAG_TX_OFFLOAD (1 << 9) +#define DPDK_DEVICE_FLAG_INTEL_PHDR_CKSUM (1 << 10) + + u16 nb_tx_desc; + CLIB_CACHE_LINE_ALIGN_MARK (cacheline1); + + u8 *interface_name_suffix; + + /* number of sub-interfaces */ + u16 num_subifs; + + /* PMD related */ + u16 tx_q_used; + u16 rx_q_used; + u16 nb_rx_desc; + u16 *cpu_socket_id_by_queue; + struct rte_eth_conf port_conf; + struct rte_eth_txconf tx_conf; + + /* HQoS related */ + dpdk_device_hqos_per_worker_thread_t *hqos_wt; + dpdk_device_hqos_per_hqos_thread_t *hqos_ht; + + /* af_packet or BondEthernet instance number */ + u8 port_id; + + /* Bonded interface port# of a slave - + only valid if DPDK_DEVICE_FLAG_BOND_SLAVE bit is set */ + u8 bond_port; + + struct rte_eth_link link; + f64 time_last_link_update; + + struct rte_eth_stats stats; + struct rte_eth_stats last_stats; + struct rte_eth_stats last_cleared_stats; + struct rte_eth_xstat *xstats; + struct rte_eth_xstat *last_cleared_xstats; + f64 time_last_stats_update; + dpdk_port_type_t port_type; + + /* mac address */ + u8 *default_mac_address; + + /* error string */ + clib_error_t *errors; +} dpdk_device_t; + +#define DPDK_STATS_POLL_INTERVAL (10.0) +#define DPDK_MIN_STATS_POLL_INTERVAL (0.001) /* 1msec */ + +#define DPDK_LINK_POLL_INTERVAL (3.0) +#define DPDK_MIN_LINK_POLL_INTERVAL (0.001) /* 1msec */ + +typedef struct +{ + u32 device; + u16 queue_id; +} dpdk_device_and_queue_t; + +#ifndef DPDK_HQOS_DBG_BYPASS +#define DPDK_HQOS_DBG_BYPASS 0 +#endif + +#ifndef HQOS_FLUSH_COUNT_THRESHOLD +#define HQOS_FLUSH_COUNT_THRESHOLD 100000 +#endif + +typedef struct dpdk_device_config_hqos_t +{ + u32 hqos_thread; + u32 hqos_thread_valid; + + u32 swq_size; + u32 burst_enq; + u32 burst_deq; + + u32 pktfield0_slabpos; + u32 pktfield1_slabpos; + u32 
pktfield2_slabpos; + u64 pktfield0_slabmask; + u64 pktfield1_slabmask; + u64 pktfield2_slabmask; + u32 tc_table[64]; + + struct rte_sched_port_params port; + struct rte_sched_subport_params *subport; + struct rte_sched_pipe_params *pipe; + uint32_t *pipe_map; +} dpdk_device_config_hqos_t; + +int dpdk_hqos_validate_mask (u64 mask, u32 n); +void dpdk_device_config_hqos_pipe_profile_default (dpdk_device_config_hqos_t * + hqos, u32 pipe_profile_id); +void dpdk_device_config_hqos_default (dpdk_device_config_hqos_t * hqos); +clib_error_t *dpdk_port_setup_hqos (dpdk_device_t * xd, + dpdk_device_config_hqos_t * hqos); +void dpdk_hqos_metadata_set (dpdk_device_hqos_per_worker_thread_t * hqos, + struct rte_mbuf **pkts, u32 n_pkts); + +#define foreach_dpdk_device_config_item \ + _ (num_rx_queues) \ + _ (num_tx_queues) \ + _ (num_rx_desc) \ + _ (num_tx_desc) \ + _ (rss_fn) + +typedef struct +{ + vlib_pci_addr_t pci_addr; + u8 is_blacklisted; + u8 vlan_strip_offload; +#define DPDK_DEVICE_VLAN_STRIP_DEFAULT 0 +#define DPDK_DEVICE_VLAN_STRIP_OFF 1 +#define DPDK_DEVICE_VLAN_STRIP_ON 2 + +#define _(x) uword x; + foreach_dpdk_device_config_item +#undef _ + clib_bitmap_t * workers; + u32 hqos_enabled; + dpdk_device_config_hqos_t hqos; +} dpdk_device_config_t; + +typedef struct +{ + + /* Config stuff */ + u8 **eal_init_args; + u8 *eal_init_args_str; + u8 *uio_driver_name; + u8 no_multi_seg; + u8 enable_tcp_udp_checksum; + + /* Required config parameters */ + u8 coremask_set_manually; + u8 nchannels_set_manually; + u32 coremask; + u32 nchannels; + u32 num_mbufs; + + /* + * format interface names ala xxxEthernet%d/%d/%d instead of + * xxxEthernet%x/%x/%x. + */ + u8 interface_name_format_decimal; + + /* per-device config */ + dpdk_device_config_t default_devconf; + dpdk_device_config_t *dev_confs; + uword *device_config_index_by_pci_addr; + +} dpdk_config_main_t; + +dpdk_config_main_t dpdk_config_main; + +typedef struct +{ + + /* Devices */ + dpdk_device_t *devices; + dpdk_device_and_queue_t **devices_by_hqos_cpu; + + /* per-thread recycle lists */ + u32 **recycle; + + /* per-thread buffer templates */ + vlib_buffer_t *buffer_templates; + + /* buffer flags template, configurable to enable/disable tcp / udp cksum */ + u32 buffer_flags_template; + + /* vlib buffer free list, must be same size as an rte_mbuf */ + u32 vlib_buffer_free_list_index; + + /* Ethernet input node index */ + u32 ethernet_input_node_index; + + /* pcap tracing [only works if (CLIB_DEBUG > 0)] */ + int tx_pcap_enable; + pcap_main_t pcap_main; + u8 *pcap_filename; + u32 pcap_sw_if_index; + u32 pcap_pkts_to_capture; + + /* + * flag indicating that a posted admin up/down + * (via post_sw_interface_set_flags) is in progress + */ + u8 admin_up_down_in_progress; + + u8 use_rss; + + /* which cpus are running I/O TX */ + int hqos_cpu_first_index; + int hqos_cpu_count; + + /* control interval of dpdk link state and stat polling */ + f64 link_state_poll_interval; + f64 stat_poll_interval; + + /* Sleep for this many usec after each device poll */ + u32 poll_sleep_usec; + + /* convenience */ + vlib_main_t *vlib_main; + vnet_main_t *vnet_main; + dpdk_config_main_t *conf; + + /* mempool */ + struct rte_mempool **pktmbuf_pools; + + /* API message ID base */ + u16 msg_id_base; +} dpdk_main_t; + +extern dpdk_main_t dpdk_main; + +typedef struct +{ + u32 buffer_index; + u16 device_index; + u8 queue_index; + struct rte_mbuf mb; + /* Copy of VLIB buffer; packet data stored in pre_data. 
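The tx trace formatter format_dpdk_tx_dma_trace() prints the Ethernet header directly from this copy. 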
*/ + vlib_buffer_t buffer; +} dpdk_tx_dma_trace_t; + +typedef struct +{ + u32 buffer_index; + u16 device_index; + u16 queue_index; + struct rte_mbuf mb; + vlib_buffer_t buffer; /* Copy of VLIB buffer; pkt data stored in pre_data. */ + u8 data[256]; /* First 256 data bytes, used for hexdump */ +} dpdk_rx_dma_trace_t; + +void dpdk_device_setup (dpdk_device_t * xd); +void dpdk_device_start (dpdk_device_t * xd); +void dpdk_device_stop (dpdk_device_t * xd); + +#if DPDK_VOID_CALLBACK +void dpdk_port_state_callback (uint8_t port_id, + enum rte_eth_event_type type, void *param); +#else +int dpdk_port_state_callback (uint8_t port_id, + enum rte_eth_event_type type, + void *param, void *ret_param); +#endif + +#define foreach_dpdk_error \ + _(NONE, "no error") \ + _(RX_PACKET_ERROR, "Rx packet errors") \ + _(RX_BAD_FCS, "Rx bad fcs") \ + _(IP_CHECKSUM_ERROR, "Rx ip checksum errors") \ + _(RX_ALLOC_FAIL, "rx buf alloc from free list failed") \ + _(RX_ALLOC_NO_PHYSMEM, "rx buf alloc failed no physmem") \ + _(RX_ALLOC_DROP_PKTS, "rx packets dropped due to alloc error") + +typedef enum +{ +#define _(f,s) DPDK_ERROR_##f, + foreach_dpdk_error +#undef _ + DPDK_N_ERROR, +} dpdk_error_t; + +void dpdk_update_link_state (dpdk_device_t * xd, f64 now); + +format_function_t format_dpdk_device_name; +format_function_t format_dpdk_device; +format_function_t format_dpdk_device_errors; +format_function_t format_dpdk_tx_dma_trace; +format_function_t format_dpdk_rx_dma_trace; +format_function_t format_dpdk_rte_mbuf; +format_function_t format_dpdk_rx_rte_mbuf; +unformat_function_t unformat_dpdk_log_level; +clib_error_t *unformat_rss_fn (unformat_input_t * input, uword * rss_fn); +clib_error_t *unformat_hqos (unformat_input_t * input, + dpdk_device_config_hqos_t * hqos); + +uword +admin_up_down_process (vlib_main_t * vm, + vlib_node_runtime_t * rt, vlib_frame_t * f); + +clib_error_t *dpdk_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, + unsigned socket_id); + +#if CLI_DEBUG +int dpdk_buffer_validate_trajectory_all (u32 * uninitialized); +void dpdk_buffer_poison_trajectory_all (void); +#endif + +#endif /* __included_dpdk_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/dpdk/device/dpdk_priv.h b/src/plugins/dpdk/device/dpdk_priv.h new file mode 100644 index 00000000..52b4ca4b --- /dev/null +++ b/src/plugins/dpdk/device/dpdk_priv.h @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#define rte_mbuf_from_vlib_buffer(x) (((struct rte_mbuf *)x) - 1) +#define vlib_buffer_from_rte_mbuf(x) ((vlib_buffer_t *)(x+1)) + +#define DPDK_NB_RX_DESC_DEFAULT 1024 +#define DPDK_NB_TX_DESC_DEFAULT 1024 +#define DPDK_NB_RX_DESC_VIRTIO 256 +#define DPDK_NB_TX_DESC_VIRTIO 256 + +#define I40E_DEV_ID_SFP_XL710 0x1572 +#define I40E_DEV_ID_QSFP_A 0x1583 +#define I40E_DEV_ID_QSFP_B 0x1584 +#define I40E_DEV_ID_QSFP_C 0x1585 +#define I40E_DEV_ID_10G_BASE_T 0x1586 +#define I40E_DEV_ID_VF 0x154C + +/* These args appear by themselves */ +#define foreach_eal_double_hyphen_predicate_arg \ +_(no-shconf) \ +_(no-hpet) \ +_(no-huge) \ +_(vmware-tsc-map) + +#define foreach_eal_single_hyphen_mandatory_arg \ +_(coremask, c) \ +_(nchannels, n) \ + +#define foreach_eal_single_hyphen_arg \ +_(blacklist, b) \ +_(mem-alloc-request, m) \ +_(force-ranks, r) + +/* These args are preceded by "--" and followed by a single string */ +#define foreach_eal_double_hyphen_arg \ +_(huge-dir) \ +_(proc-type) \ +_(file-prefix) \ +_(vdev) + +static inline void +dpdk_get_xstats (dpdk_device_t * xd) +{ + int len; + if ((len = rte_eth_xstats_get (xd->device_index, NULL, 0)) > 0) + { + vec_validate (xd->xstats, len - 1); + vec_validate (xd->last_cleared_xstats, len - 1); + + len = + rte_eth_xstats_get (xd->device_index, xd->xstats, + vec_len (xd->xstats)); + + ASSERT (vec_len (xd->xstats) == len); + ASSERT (vec_len (xd->last_cleared_xstats) == len); + + _vec_len (xd->xstats) = len; + _vec_len (xd->last_cleared_xstats) = len; + + } +} + + +static inline void +dpdk_update_counters (dpdk_device_t * xd, f64 now) +{ + vlib_simple_counter_main_t *cm; + vnet_main_t *vnm = vnet_get_main (); + u32 thread_index = vlib_get_thread_index (); + u64 rxerrors, last_rxerrors; + + /* only update counters for PMD interfaces */ + if ((xd->flags & DPDK_DEVICE_FLAG_PMD) == 0) + return; + + xd->time_last_stats_update = now ? now : xd->time_last_stats_update; + clib_memcpy (&xd->last_stats, &xd->stats, sizeof (xd->last_stats)); + rte_eth_stats_get (xd->device_index, &xd->stats); + + /* maybe bump interface rx no buffer counter */ + if (PREDICT_FALSE (xd->stats.rx_nombuf != xd->last_stats.rx_nombuf)) + { + cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, + VNET_INTERFACE_COUNTER_RX_NO_BUF); + + vlib_increment_simple_counter (cm, thread_index, xd->vlib_sw_if_index, + xd->stats.rx_nombuf - + xd->last_stats.rx_nombuf); + } + + /* missed pkt counter */ + if (PREDICT_FALSE (xd->stats.imissed != xd->last_stats.imissed)) + { + cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, + VNET_INTERFACE_COUNTER_RX_MISS); + + vlib_increment_simple_counter (cm, thread_index, xd->vlib_sw_if_index, + xd->stats.imissed - + xd->last_stats.imissed); + } + rxerrors = xd->stats.ierrors; + last_rxerrors = xd->last_stats.ierrors; + + if (PREDICT_FALSE (rxerrors != last_rxerrors)) + { + cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, + VNET_INTERFACE_COUNTER_RX_ERROR); + + vlib_increment_simple_counter (cm, thread_index, xd->vlib_sw_if_index, + rxerrors - last_rxerrors); + } + + dpdk_get_xstats (xd); +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/dpdk/device/format.c b/src/plugins/dpdk/device/format.c new file mode 100644 index 00000000..697bdbe5 --- /dev/null +++ b/src/plugins/dpdk/device/format.c @@ -0,0 +1,804 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <vnet/vnet.h> +#include <vppinfra/vec.h> +#include <vppinfra/format.h> +#include <vlib/unix/cj.h> +#include <assert.h> + +#include <vnet/ethernet/ethernet.h> +#include <dpdk/device/dpdk.h> + +#include <dpdk/device/dpdk_priv.h> +#include <vppinfra/error.h> + +#define foreach_dpdk_counter \ + _ (tx_frames_ok, opackets) \ + _ (tx_bytes_ok, obytes) \ + _ (tx_errors, oerrors) \ + _ (rx_frames_ok, ipackets) \ + _ (rx_bytes_ok, ibytes) \ + _ (rx_errors, ierrors) \ + _ (rx_missed, imissed) \ + _ (rx_no_bufs, rx_nombuf) + +#define foreach_dpdk_q_counter \ + _ (rx_frames_ok, q_ipackets) \ + _ (tx_frames_ok, q_opackets) \ + _ (rx_bytes_ok, q_ibytes) \ + _ (tx_bytes_ok, q_obytes) \ + _ (rx_errors, q_errors) + +#define foreach_dpdk_rss_hf \ + _(ETH_RSS_FRAG_IPV4, "ipv4-frag") \ + _(ETH_RSS_NONFRAG_IPV4_TCP, "ipv4-tcp") \ + _(ETH_RSS_NONFRAG_IPV4_UDP, "ipv4-udp") \ + _(ETH_RSS_NONFRAG_IPV4_SCTP, "ipv4-sctp") \ + _(ETH_RSS_NONFRAG_IPV4_OTHER, "ipv4-other") \ + _(ETH_RSS_IPV4, "ipv4") \ + _(ETH_RSS_IPV6_TCP_EX, "ipv6-tcp-ex") \ + _(ETH_RSS_IPV6_UDP_EX, "ipv6-udp-ex") \ + _(ETH_RSS_FRAG_IPV6, "ipv6-frag") \ + _(ETH_RSS_NONFRAG_IPV6_TCP, "ipv6-tcp") \ + _(ETH_RSS_NONFRAG_IPV6_UDP, "ipv6-udp") \ + _(ETH_RSS_NONFRAG_IPV6_SCTP, "ipv6-sctp") \ + _(ETH_RSS_NONFRAG_IPV6_OTHER, "ipv6-other") \ + _(ETH_RSS_L2_PAYLOAD, "l2-payload") \ + _(ETH_RSS_IPV6_EX, "ipv6-ex") \ + _(ETH_RSS_IPV6, "ipv6") + + +#define foreach_dpdk_rx_offload_caps \ + _(DEV_RX_OFFLOAD_VLAN_STRIP, "vlan-strip") \ + _(DEV_RX_OFFLOAD_IPV4_CKSUM, "ipv4-cksum") \ + _(DEV_RX_OFFLOAD_UDP_CKSUM , "udp-cksum") \ + _(DEV_RX_OFFLOAD_TCP_CKSUM , "tcp-cksum") \ + _(DEV_RX_OFFLOAD_TCP_LRO , "tcp-lro") \ + _(DEV_RX_OFFLOAD_QINQ_STRIP, "qinq-strip") + +#define foreach_dpdk_tx_offload_caps \ + _(DEV_TX_OFFLOAD_VLAN_INSERT, "vlan-insert") \ + _(DEV_TX_OFFLOAD_IPV4_CKSUM, "ipv4-cksum") \ + _(DEV_TX_OFFLOAD_UDP_CKSUM , "udp-cksum") \ + _(DEV_TX_OFFLOAD_TCP_CKSUM , "tcp-cksum") \ + _(DEV_TX_OFFLOAD_SCTP_CKSUM , "sctp-cksum") \ + _(DEV_TX_OFFLOAD_TCP_TSO , "tcp-tso") \ + _(DEV_TX_OFFLOAD_UDP_TSO , "udp-tso") \ + _(DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM, "outer-ipv4-cksum") \ + _(DEV_TX_OFFLOAD_QINQ_INSERT, "qinq-insert") + +#define foreach_dpdk_pkt_rx_offload_flag \ + _ (PKT_RX_VLAN_PKT, "RX packet is an 802.1q VLAN packet") \ + _ (PKT_RX_RSS_HASH, "RX packet with RSS hash result") \ + _ (PKT_RX_FDIR, "RX packet with FDIR infos") \ + _ (PKT_RX_L4_CKSUM_BAD, "L4 cksum of RX pkt. is not OK") \ + _ (PKT_RX_IP_CKSUM_BAD, "IP cksum of RX pkt. is not OK") \ + _ (PKT_RX_VLAN_STRIPPED, "RX packet VLAN tag stripped") \ + _ (PKT_RX_IP_CKSUM_GOOD, "IP cksum of RX pkt. is valid") \ + _ (PKT_RX_L4_CKSUM_GOOD, "L4 cksum of RX pkt. 
is valid") \ + _ (PKT_RX_IEEE1588_PTP, "RX IEEE1588 L2 Ethernet PT Packet") \ + _ (PKT_RX_IEEE1588_TMST, "RX IEEE1588 L2/L4 timestamped packet") \ + _ (PKT_RX_QINQ_STRIPPED, "RX packet QinQ tags stripped") + +#define foreach_dpdk_pkt_type \ + _ (L2, ETHER, "Ethernet packet") \ + _ (L2, ETHER_TIMESYNC, "Ethernet packet for time sync") \ + _ (L2, ETHER_ARP, "ARP packet") \ + _ (L2, ETHER_LLDP, "LLDP (Link Layer Discovery Protocol) packet") \ + _ (L2, ETHER_NSH, "NSH (Network Service Header) packet") \ + _ (L2, ETHER_VLAN, "VLAN packet") \ + _ (L2, ETHER_QINQ, "QinQ packet") \ + _ (L3, IPV4, "IPv4 packet without extension headers") \ + _ (L3, IPV4_EXT, "IPv4 packet with extension headers") \ + _ (L3, IPV4_EXT_UNKNOWN, "IPv4 packet with or without extension headers") \ + _ (L3, IPV6, "IPv6 packet without extension headers") \ + _ (L3, IPV6_EXT, "IPv6 packet with extension headers") \ + _ (L3, IPV6_EXT_UNKNOWN, "IPv6 packet with or without extension headers") \ + _ (L4, TCP, "TCP packet") \ + _ (L4, UDP, "UDP packet") \ + _ (L4, FRAG, "Fragmented IP packet") \ + _ (L4, SCTP, "SCTP (Stream Control Transmission Protocol) packet") \ + _ (L4, ICMP, "ICMP packet") \ + _ (L4, NONFRAG, "Non-fragmented IP packet") \ + _ (TUNNEL, GRE, "GRE tunneling packet") \ + _ (TUNNEL, VXLAN, "VXLAN tunneling packet") \ + _ (TUNNEL, NVGRE, "NVGRE Tunneling packet") \ + _ (TUNNEL, GENEVE, "GENEVE Tunneling packet") \ + _ (TUNNEL, GRENAT, "Teredo, VXLAN or GRE Tunneling packet") \ + _ (INNER_L2, ETHER, "Inner Ethernet packet") \ + _ (INNER_L2, ETHER_VLAN, "Inner Ethernet packet with VLAN") \ + _ (INNER_L3, IPV4, "Inner IPv4 packet without extension headers") \ + _ (INNER_L3, IPV4_EXT, "Inner IPv4 packet with extension headers") \ + _ (INNER_L3, IPV4_EXT_UNKNOWN, "Inner IPv4 packet with or without extension headers") \ + _ (INNER_L3, IPV6, "Inner IPv6 packet without extension headers") \ + _ (INNER_L3, IPV6_EXT, "Inner IPv6 packet with extension headers") \ + _ (INNER_L3, IPV6_EXT_UNKNOWN, "Inner IPv6 packet with or without extension headers") \ + _ (INNER_L4, TCP, "Inner TCP packet") \ + _ (INNER_L4, UDP, "Inner UDP packet") \ + _ (INNER_L4, FRAG, "Inner fagmented IP packet") \ + _ (INNER_L4, SCTP, "Inner SCTP (Stream Control Transmission Protocol) packet") \ + _ (INNER_L4, ICMP, "Inner ICMP packet") \ + _ (INNER_L4, NONFRAG, "Inner non-fragmented IP packet") + +#define foreach_dpdk_pkt_tx_offload_flag \ + _ (PKT_TX_VLAN_PKT, "TX packet is a 802.1q VLAN packet") \ + _ (PKT_TX_IP_CKSUM, "IP cksum of TX pkt. computed by NIC") \ + _ (PKT_TX_TCP_CKSUM, "TCP cksum of TX pkt. computed by NIC") \ + _ (PKT_TX_SCTP_CKSUM, "SCTP cksum of TX pkt. 
computed by NIC") \ + _ (PKT_TX_IEEE1588_TMST, "TX IEEE1588 packet to timestamp") + +#define foreach_dpdk_pkt_offload_flag \ + foreach_dpdk_pkt_rx_offload_flag \ + foreach_dpdk_pkt_tx_offload_flag + +#define foreach_dpdk_log_level \ + _ (EMERG, "emergency") \ + _ (ALERT, "alert") \ + _ (CRIT, "critical") \ + _ (ERR, "error") \ + _ (WARNING, "warning") \ + _ (NOTICE, "notice") \ + _ (INFO, "info") \ + _ (DEBUG, "debug") + +u8 * +format_dpdk_device_name (u8 * s, va_list * args) +{ + dpdk_main_t *dm = &dpdk_main; + char *devname_format; + char *device_name; + u32 i = va_arg (*args, u32); + struct rte_eth_dev_info dev_info; + u8 *ret; + + if (dm->conf->interface_name_format_decimal) + devname_format = "%s%d/%d/%d"; + else + devname_format = "%s%x/%x/%x"; + + switch (dm->devices[i].port_type) + { + case VNET_DPDK_PORT_TYPE_ETH_1G: + device_name = "GigabitEthernet"; + break; + + case VNET_DPDK_PORT_TYPE_ETH_10G: + device_name = "TenGigabitEthernet"; + break; + + case VNET_DPDK_PORT_TYPE_ETH_25G: + device_name = "TwentyFiveGigabitEthernet"; + break; + + case VNET_DPDK_PORT_TYPE_ETH_40G: + device_name = "FortyGigabitEthernet"; + break; + + case VNET_DPDK_PORT_TYPE_ETH_50G: + device_name = "FiftyGigabitEthernet"; + break; + + case VNET_DPDK_PORT_TYPE_ETH_100G: + device_name = "HundredGigabitEthernet"; + break; + + case VNET_DPDK_PORT_TYPE_ETH_BOND: + return format (s, "BondEthernet%d", dm->devices[i].port_id); + + case VNET_DPDK_PORT_TYPE_ETH_SWITCH: + device_name = "EthernetSwitch"; + break; + + case VNET_DPDK_PORT_TYPE_ETH_VF: + device_name = "VirtualFunctionEthernet"; + break; + + case VNET_DPDK_PORT_TYPE_AF_PACKET: + rte_eth_dev_info_get (i, &dev_info); + return format (s, "af_packet%d", dm->devices[i].port_id); + + case VNET_DPDK_PORT_TYPE_VIRTIO_USER: + device_name = "VirtioUser"; + break; + + case VNET_DPDK_PORT_TYPE_VHOST_ETHER: + device_name = "VhostEthernet"; + break; + + default: + case VNET_DPDK_PORT_TYPE_UNKNOWN: + device_name = "UnknownEthernet"; + break; + } + + rte_eth_dev_info_get (i, &dev_info); + + if (dev_info.pci_dev) + ret = format (s, devname_format, device_name, dev_info.pci_dev->addr.bus, + dev_info.pci_dev->addr.devid, + dev_info.pci_dev->addr.function); + else + ret = format (s, "%s%d", device_name, dm->devices[i].device_index); + + if (dm->devices[i].interface_name_suffix) + return format (ret, "/%s", dm->devices[i].interface_name_suffix); + return ret; +} + +static u8 * +format_dpdk_device_type (u8 * s, va_list * args) +{ + dpdk_main_t *dm = &dpdk_main; + char *dev_type; + u32 i = va_arg (*args, u32); + + switch (dm->devices[i].pmd) + { + case VNET_DPDK_PMD_E1000EM: + dev_type = "Intel 82540EM (e1000)"; + break; + + case VNET_DPDK_PMD_IGB: + dev_type = "Intel e1000"; + break; + + case VNET_DPDK_PMD_I40E: + dev_type = "Intel X710/XL710 Family"; + break; + + case VNET_DPDK_PMD_I40EVF: + dev_type = "Intel X710/XL710 Family VF"; + break; + + case VNET_DPDK_PMD_FM10K: + dev_type = "Intel FM10000 Family Ethernet Switch"; + break; + + case VNET_DPDK_PMD_IGBVF: + dev_type = "Intel e1000 VF"; + break; + + case VNET_DPDK_PMD_VIRTIO: + dev_type = "Red Hat Virtio"; + break; + + case VNET_DPDK_PMD_IXGBEVF: + dev_type = "Intel 82599 VF"; + break; + + case VNET_DPDK_PMD_IXGBE: + dev_type = "Intel 82599"; + break; + + case VNET_DPDK_PMD_ENIC: + dev_type = "Cisco VIC"; + break; + + case VNET_DPDK_PMD_CXGBE: + dev_type = "Chelsio T4/T5"; + break; + + case VNET_DPDK_PMD_MLX4: + dev_type = "Mellanox ConnectX-3 Family"; + break; + + case VNET_DPDK_PMD_MLX5: + dev_type = "Mellanox ConnectX-4 
Family"; + break; + + case VNET_DPDK_PMD_VMXNET3: + dev_type = "VMware VMXNET3"; + break; + + case VNET_DPDK_PMD_AF_PACKET: + dev_type = "af_packet"; + break; + + case VNET_DPDK_PMD_BOND: + dev_type = "Ethernet Bonding"; + break; + + case VNET_DPDK_PMD_DPAA2: + dev_type = "NXP DPAA2 Mac"; + break; + + case VNET_DPDK_PMD_VIRTIO_USER: + dev_type = "Virtio User"; + break; + + case VNET_DPDK_PMD_THUNDERX: + dev_type = "Cavium ThunderX"; + break; + + case VNET_DPDK_PMD_VHOST_ETHER: + dev_type = "VhostEthernet"; + break; + + default: + case VNET_DPDK_PMD_UNKNOWN: + dev_type = "### UNKNOWN ###"; + break; + } + + return format (s, dev_type); +} + +static u8 * +format_dpdk_link_status (u8 * s, va_list * args) +{ + dpdk_device_t *xd = va_arg (*args, dpdk_device_t *); + struct rte_eth_link *l = &xd->link; + vnet_main_t *vnm = vnet_get_main (); + vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, xd->hw_if_index); + + s = format (s, "%s ", l->link_status ? "up" : "down"); + if (l->link_status) + { + u32 promisc = rte_eth_promiscuous_get (xd->device_index); + + s = format (s, "%s duplex ", (l->link_duplex == ETH_LINK_FULL_DUPLEX) ? + "full" : "half"); + s = format (s, "speed %u mtu %d %s\n", l->link_speed, + hi->max_packet_bytes, promisc ? " promisc" : ""); + } + else + s = format (s, "\n"); + + return s; +} + +#define _line_len 72 +#define _(v, str) \ +if (bitmap & v) { \ + if (format_get_indent (s) > next_split ) { \ + next_split += _line_len; \ + s = format(s,"\n%U", format_white_space, indent); \ + } \ + s = format(s, "%s ", str); \ +} + +static u8 * +format_dpdk_rss_hf_name (u8 * s, va_list * args) +{ + u64 bitmap = va_arg (*args, u64); + int next_split = _line_len; + int indent = format_get_indent (s); + + if (!bitmap) + return format (s, "none"); + + foreach_dpdk_rss_hf return s; +} + +static u8 * +format_dpdk_rx_offload_caps (u8 * s, va_list * args) +{ + u32 bitmap = va_arg (*args, u32); + int next_split = _line_len; + int indent = format_get_indent (s); + + if (!bitmap) + return format (s, "none"); + + foreach_dpdk_rx_offload_caps return s; +} + +static u8 * +format_dpdk_tx_offload_caps (u8 * s, va_list * args) +{ + u32 bitmap = va_arg (*args, u32); + int next_split = _line_len; + int indent = format_get_indent (s); + if (!bitmap) + return format (s, "none"); + + foreach_dpdk_tx_offload_caps return s; +} + +#undef _line_len +#undef _ + +u8 * +format_dpdk_device_errors (u8 * s, va_list * args) +{ + dpdk_device_t *xd = va_arg (*args, dpdk_device_t *); + clib_error_t *e; + uword indent = format_get_indent (s); + + vec_foreach (e, xd->errors) + { + s = format (s, "%U%v\n", format_white_space, indent, e->what); + } + return s; +} + +u8 * +format_dpdk_device (u8 * s, va_list * args) +{ + u32 dev_instance = va_arg (*args, u32); + int verbose = va_arg (*args, int); + dpdk_main_t *dm = &dpdk_main; + dpdk_device_t *xd = vec_elt_at_index (dm->devices, dev_instance); + uword indent = format_get_indent (s); + f64 now = vlib_time_now (dm->vlib_main); + struct rte_eth_dev_info di; + + dpdk_update_counters (xd, now); + dpdk_update_link_state (xd, now); + + s = format (s, "%U\n%Ucarrier %U", + format_dpdk_device_type, xd->device_index, + format_white_space, indent + 2, format_dpdk_link_status, xd); + + rte_eth_dev_info_get (xd->device_index, &di); + + if (verbose > 1 && xd->flags & DPDK_DEVICE_FLAG_PMD) + { + struct rte_pci_device *pci; + struct rte_eth_rss_conf rss_conf; + int vlan_off; + int retval; + + rss_conf.rss_key = 0; + retval = rte_eth_dev_rss_hash_conf_get (xd->device_index, &rss_conf); + if 
(retval < 0) + clib_warning ("rte_eth_dev_rss_hash_conf_get returned %d", retval); + pci = di.pci_dev; + + if (pci) + s = + format (s, + "%Upci id: device %04x:%04x subsystem %04x:%04x\n" + "%Upci address: %04x:%02x:%02x.%02x\n", + format_white_space, indent + 2, pci->id.vendor_id, + pci->id.device_id, pci->id.subsystem_vendor_id, + pci->id.subsystem_device_id, format_white_space, indent + 2, + pci->addr.domain, pci->addr.bus, pci->addr.devid, + pci->addr.function); + s = + format (s, "%Umax rx packet len: %d\n", format_white_space, + indent + 2, di.max_rx_pktlen); + s = + format (s, "%Umax num of queues: rx %d tx %d\n", format_white_space, + indent + 2, di.max_rx_queues, di.max_tx_queues); + s = + format (s, "%Upromiscuous: unicast %s all-multicast %s\n", + format_white_space, indent + 2, + rte_eth_promiscuous_get (xd->device_index) ? "on" : "off", + rte_eth_allmulticast_get (xd->device_index) ? "on" : "off"); + vlan_off = rte_eth_dev_get_vlan_offload (xd->device_index); + s = format (s, "%Uvlan offload: strip %s filter %s qinq %s\n", + format_white_space, indent + 2, + vlan_off & ETH_VLAN_STRIP_OFFLOAD ? "on" : "off", + vlan_off & ETH_VLAN_FILTER_OFFLOAD ? "on" : "off", + vlan_off & ETH_VLAN_EXTEND_OFFLOAD ? "on" : "off"); + s = format (s, "%Urx offload caps: %U\n", + format_white_space, indent + 2, + format_dpdk_rx_offload_caps, di.rx_offload_capa); + s = format (s, "%Utx offload caps: %U\n", + format_white_space, indent + 2, + format_dpdk_tx_offload_caps, di.tx_offload_capa); + s = format (s, "%Urss active: %U\n" + "%Urss supported: %U\n", + format_white_space, indent + 2, + format_dpdk_rss_hf_name, rss_conf.rss_hf, + format_white_space, indent + 2, + format_dpdk_rss_hf_name, di.flow_type_rss_offloads); + } + + s = format (s, "%Urx queues %d, rx desc %d, tx queues %d, tx desc %d\n", + format_white_space, indent + 2, + xd->rx_q_used, xd->nb_rx_desc, xd->tx_q_used, xd->nb_tx_desc); + + if (xd->cpu_socket > -1) + s = format (s, "%Ucpu socket %d\n", + format_white_space, indent + 2, xd->cpu_socket); + + /* $$$ MIB counters */ + { +#define _(N, V) \ + if ((xd->stats.V - xd->last_cleared_stats.V) != 0) { \ + s = format (s, "\n%U%-40U%16Ld", \ + format_white_space, indent + 2, \ + format_c_identifier, #N, \ + xd->stats.V - xd->last_cleared_stats.V); \ + } \ + + foreach_dpdk_counter +#undef _ + } + + u8 *xs = 0; + u32 i = 0; + struct rte_eth_xstat *xstat, *last_xstat; + struct rte_eth_xstat_name *xstat_names = 0; + int len = rte_eth_xstats_get_names (xd->device_index, NULL, 0); + vec_validate (xstat_names, len - 1); + rte_eth_xstats_get_names (xd->device_index, xstat_names, len); + + ASSERT (vec_len (xd->xstats) == vec_len (xd->last_cleared_xstats)); + + /* *INDENT-OFF* */ + vec_foreach_index(i, xd->xstats) + { + u64 delta = 0; + xstat = vec_elt_at_index(xd->xstats, i); + last_xstat = vec_elt_at_index(xd->last_cleared_xstats, i); + + delta = xstat->value - last_xstat->value; + if (verbose == 2 || (verbose && delta)) + { + /* format_c_identifier doesn't like c strings inside vector */ + u8 * name = format(0,"%s", xstat_names[i].name); + xs = format(xs, "\n%U%-38U%16Ld", + format_white_space, indent + 4, + format_c_identifier, name, delta); + vec_free(name); + } + } + /* *INDENT-ON* */ + + vec_free (xstat_names); + + if (xs) + { + s = format (s, "\n%Uextended stats:%v", + format_white_space, indent + 2, xs); + vec_free (xs); + } + + if (vec_len (xd->errors)) + { + s = format (s, "%UErrors:\n %U", format_white_space, indent, + format_dpdk_device_errors, xd); + } + + return s; +} + +u8 * 
+format_dpdk_tx_dma_trace (u8 * s, va_list * va) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *); + CLIB_UNUSED (vnet_main_t * vnm) = vnet_get_main (); + dpdk_tx_dma_trace_t *t = va_arg (*va, dpdk_tx_dma_trace_t *); + dpdk_main_t *dm = &dpdk_main; + dpdk_device_t *xd = vec_elt_at_index (dm->devices, t->device_index); + uword indent = format_get_indent (s); + vnet_sw_interface_t *sw = vnet_get_sw_interface (vnm, xd->vlib_sw_if_index); + + s = format (s, "%U tx queue %d", + format_vnet_sw_interface_name, vnm, sw, t->queue_index); + + s = format (s, "\n%Ubuffer 0x%x: %U", + format_white_space, indent, + t->buffer_index, format_vlib_buffer, &t->buffer); + + s = format (s, "\n%U%U", format_white_space, indent, + format_ethernet_header_with_length, t->buffer.pre_data, + sizeof (t->buffer.pre_data)); + + return s; +} + +u8 * +format_dpdk_rx_dma_trace (u8 * s, va_list * va) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *); + CLIB_UNUSED (vnet_main_t * vnm) = vnet_get_main (); + dpdk_rx_dma_trace_t *t = va_arg (*va, dpdk_rx_dma_trace_t *); + dpdk_main_t *dm = &dpdk_main; + dpdk_device_t *xd = vec_elt_at_index (dm->devices, t->device_index); + format_function_t *f; + uword indent = format_get_indent (s); + vnet_sw_interface_t *sw = vnet_get_sw_interface (vnm, xd->vlib_sw_if_index); + + s = format (s, "%U rx queue %d", + format_vnet_sw_interface_name, vnm, sw, t->queue_index); + + s = format (s, "\n%Ubuffer 0x%x: %U", + format_white_space, indent, + t->buffer_index, format_vlib_buffer, &t->buffer); + + s = format (s, "\n%U%U", + format_white_space, indent, + format_dpdk_rte_mbuf, &t->mb, &t->data); + + if (vm->trace_main.verbose) + { + s = format (s, "\n%UPacket Dump%s", format_white_space, indent + 2, + t->mb.data_len > sizeof (t->data) ? " (truncated)" : ""); + s = format (s, "\n%U%U", format_white_space, indent + 4, + format_hexdump, &t->data, + t->mb.data_len > + sizeof (t->data) ? 
sizeof (t->data) : t->mb.data_len); + } + f = node->format_buffer; + if (!f) + f = format_hex_bytes; + s = format (s, "\n%U%U", format_white_space, indent, + f, t->buffer.pre_data, sizeof (t->buffer.pre_data)); + + return s; +} + + +static inline u8 * +format_dpdk_pkt_types (u8 * s, va_list * va) +{ + u32 *pkt_types = va_arg (*va, u32 *); + uword indent __attribute__ ((unused)) = format_get_indent (s) + 2; + + if (!*pkt_types) + return s; + + s = format (s, "Packet Types"); + +#define _(L, F, S) \ + if ((*pkt_types & RTE_PTYPE_##L##_MASK) == RTE_PTYPE_##L##_##F) \ + { \ + s = format (s, "\n%U%s (0x%04x) %s", format_white_space, indent, \ + "RTE_PTYPE_" #L "_" #F, RTE_PTYPE_##L##_##F, S); \ + } + + foreach_dpdk_pkt_type +#undef _ + return s; +} + +static inline u8 * +format_dpdk_pkt_offload_flags (u8 * s, va_list * va) +{ + u64 *ol_flags = va_arg (*va, u64 *); + uword indent = format_get_indent (s) + 2; + + if (!*ol_flags) + return s; + + s = format (s, "Packet Offload Flags"); + +#define _(F, S) \ + if (*ol_flags & F) \ + { \ + s = format (s, "\n%U%s (0x%04x) %s", \ + format_white_space, indent, #F, F, S); \ + } + + foreach_dpdk_pkt_offload_flag +#undef _ + return s; +} + +u8 * +format_dpdk_rte_mbuf_vlan (u8 * s, va_list * va) +{ + ethernet_vlan_header_tv_t *vlan_hdr = + va_arg (*va, ethernet_vlan_header_tv_t *); + + if (clib_net_to_host_u16 (vlan_hdr->type) == ETHERNET_TYPE_DOT1AD) + { + s = format (s, "%U 802.1q vlan ", + format_ethernet_vlan_tci, + clib_net_to_host_u16 (vlan_hdr->priority_cfi_and_id)); + vlan_hdr++; + } + + s = format (s, "%U", + format_ethernet_vlan_tci, + clib_net_to_host_u16 (vlan_hdr->priority_cfi_and_id)); + + return s; +} + +u8 * +format_dpdk_rte_mbuf (u8 * s, va_list * va) +{ + struct rte_mbuf *mb = va_arg (*va, struct rte_mbuf *); + ethernet_header_t *eth_hdr = va_arg (*va, ethernet_header_t *); + uword indent = format_get_indent (s) + 2; + + s = format (s, "PKT MBUF: port %d, nb_segs %d, pkt_len %d" + "\n%Ubuf_len %d, data_len %d, ol_flags 0x%x, data_off %d, phys_addr 0x%x" + "\n%Upacket_type 0x%x", + mb->port, mb->nb_segs, mb->pkt_len, + format_white_space, indent, + mb->buf_len, mb->data_len, mb->ol_flags, mb->data_off, + mb->buf_physaddr, format_white_space, indent, mb->packet_type); + + if (mb->ol_flags) + s = format (s, "\n%U%U", format_white_space, indent, + format_dpdk_pkt_offload_flags, &mb->ol_flags); + + if ((mb->ol_flags & PKT_RX_VLAN_PKT) && + ((mb->ol_flags & (PKT_RX_VLAN_STRIPPED | PKT_RX_QINQ_STRIPPED)) == 0)) + { + ethernet_vlan_header_tv_t *vlan_hdr = + ((ethernet_vlan_header_tv_t *) & (eth_hdr->type)); + s = format (s, " %U", format_dpdk_rte_mbuf_vlan, vlan_hdr); + } + + if (mb->packet_type) + s = format (s, "\n%U%U", format_white_space, indent, + format_dpdk_pkt_types, &mb->packet_type); + + return s; +} + +clib_error_t * +unformat_rss_fn (unformat_input_t * input, uword * rss_fn) +{ + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (0) + ; +#undef _ +#define _(f, s) \ + else if (unformat (input, s)) \ + *rss_fn |= f; + + foreach_dpdk_rss_hf +#undef _ + else + { + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + } + return 0; +} + +uword +unformat_dpdk_log_level (unformat_input_t * input, va_list * args) +{ + u32 *r = va_arg (*args, u32 *); + + if (0); +#define _(v,s) else if (unformat (input, s)) *r = RTE_LOG_##v; + foreach_dpdk_log_level +#undef _ + else + return 0; + return 1; +} + +clib_error_t * +unformat_hqos (unformat_input_t * input, dpdk_device_config_hqos_t * hqos) +{ + 
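/* Only the "hqos-thread <n>" parameter is parsed here; the remaining + per-port HQoS parameters presumably keep the defaults from + dpdk_device_config_hqos_default(). A hypothetical startup.conf fragment + this would accept: dev 0000:02:00.0 { hqos { hqos-thread 2 } } */ + 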
clib_error_t *error = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "hqos-thread %u", &hqos->hqos_thread)) + hqos->hqos_thread_valid = 1; + else + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + break; + } + } + + return error; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c new file mode 100755 index 00000000..acf712ff --- /dev/null +++ b/src/plugins/dpdk/device/init.c @@ -0,0 +1,1589 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <vnet/vnet.h> +#include <vppinfra/vec.h> +#include <vppinfra/error.h> +#include <vppinfra/format.h> +#include <vppinfra/bitmap.h> +#include <vppinfra/linux/sysfs.h> +#include <vlib/unix/unix.h> + +#include <vnet/ethernet/ethernet.h> +#include <dpdk/device/dpdk.h> +#include <vlib/pci/pci.h> + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/stat.h> +#include <sys/mount.h> +#include <string.h> +#include <fcntl.h> + +#include <dpdk/device/dpdk_priv.h> + +dpdk_main_t dpdk_main; + +#define LINK_STATE_ELOGS 0 + +/* Port configuration, mildly modified Intel app values */ + +static struct rte_eth_conf port_conf_template = { + .rxmode = { + .split_hdr_size = 0, + .header_split = 0, /**< Header Split disabled */ + .hw_ip_checksum = 0, /**< IP checksum offload disabled */ + .hw_vlan_filter = 0, /**< VLAN filtering disabled */ + .hw_strip_crc = 0, /**< CRC stripping by hardware disabled */ + }, + .txmode = { + .mq_mode = ETH_MQ_TX_NONE, + }, +}; + +static dpdk_port_type_t +port_type_from_speed_capa (struct rte_eth_dev_info *dev_info) +{ + + if (dev_info->speed_capa & ETH_LINK_SPEED_100G) + return VNET_DPDK_PORT_TYPE_ETH_100G; + else if (dev_info->speed_capa & ETH_LINK_SPEED_50G) + return VNET_DPDK_PORT_TYPE_ETH_50G; + else if (dev_info->speed_capa & ETH_LINK_SPEED_40G) + return VNET_DPDK_PORT_TYPE_ETH_40G; + else if (dev_info->speed_capa & ETH_LINK_SPEED_25G) + return VNET_DPDK_PORT_TYPE_ETH_25G; + else if (dev_info->speed_capa & ETH_LINK_SPEED_10G) + return VNET_DPDK_PORT_TYPE_ETH_10G; + else if (dev_info->speed_capa & ETH_LINK_SPEED_1G) + return VNET_DPDK_PORT_TYPE_ETH_1G; + + return VNET_DPDK_PORT_TYPE_UNKNOWN; +} + + +static u32 +dpdk_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, u32 flags) +{ + dpdk_main_t *dm = &dpdk_main; + dpdk_device_t *xd = vec_elt_at_index (dm->devices, hi->dev_instance); + u32 old = 0; + + if (ETHERNET_INTERFACE_FLAG_CONFIG_PROMISC (flags)) + { + old = (xd->flags & DPDK_DEVICE_FLAG_PROMISC) != 0; + + if (flags & ETHERNET_INTERFACE_FLAG_ACCEPT_ALL) + xd->flags |= DPDK_DEVICE_FLAG_PROMISC; + else + xd->flags &= ~DPDK_DEVICE_FLAG_PROMISC; + + if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) + { + if (xd->flags & DPDK_DEVICE_FLAG_PROMISC) + rte_eth_promiscuous_enable (xd->device_index); + else + 
rte_eth_promiscuous_disable (xd->device_index); + } + } + else if (ETHERNET_INTERFACE_FLAG_CONFIG_MTU (flags)) + { + int rv; + + xd->port_conf.rxmode.max_rx_pkt_len = hi->max_packet_bytes; + + if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) + dpdk_device_stop (xd); + + rv = rte_eth_dev_configure + (xd->device_index, xd->rx_q_used, xd->tx_q_used, &xd->port_conf); + + if (rv < 0) + vlib_cli_output (vlib_get_main (), + "rte_eth_dev_configure[%d]: err %d", + xd->device_index, rv); + + rte_eth_dev_set_mtu (xd->device_index, hi->max_packet_bytes); + + if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) + dpdk_device_start (xd); + + } + return old; +} + +static void +dpdk_device_lock_init (dpdk_device_t * xd) +{ + int q; + vec_validate (xd->lockp, xd->tx_q_used - 1); + for (q = 0; q < xd->tx_q_used; q++) + { + xd->lockp[q] = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES, + CLIB_CACHE_LINE_BYTES); + memset ((void *) xd->lockp[q], 0, CLIB_CACHE_LINE_BYTES); + } +} + +static clib_error_t * +dpdk_lib_init (dpdk_main_t * dm) +{ + u32 nports; + u32 nb_desc = 0; + int i; + clib_error_t *error; + vlib_main_t *vm = vlib_get_main (); + vlib_thread_main_t *tm = vlib_get_thread_main (); + vnet_device_main_t *vdm = &vnet_device_main; + vnet_sw_interface_t *sw; + vnet_hw_interface_t *hi; + dpdk_device_t *xd; + vlib_pci_addr_t last_pci_addr; + u32 last_pci_addr_port = 0; + vlib_thread_registration_t *tr_hqos; + uword *p_hqos; + + u32 next_hqos_cpu = 0; + u8 af_packet_port_id = 0; + u8 bond_ether_port_id = 0; + last_pci_addr.as_u32 = ~0; + + dm->hqos_cpu_first_index = 0; + dm->hqos_cpu_count = 0; + + /* find out which cpus will be used for I/O TX */ + p_hqos = hash_get_mem (tm->thread_registrations_by_name, "hqos-threads"); + tr_hqos = p_hqos ? (vlib_thread_registration_t *) p_hqos[0] : 0; + + if (tr_hqos && tr_hqos->count > 0) + { + dm->hqos_cpu_first_index = tr_hqos->first_index; + dm->hqos_cpu_count = tr_hqos->count; + } + + vec_validate_aligned (dm->devices_by_hqos_cpu, tm->n_vlib_mains - 1, + CLIB_CACHE_LINE_BYTES); + + nports = rte_eth_dev_count (); + if (nports < 1) + { + clib_warning ("DPDK drivers found no ports..."); + } + + if (CLIB_DEBUG > 0) + clib_warning ("DPDK drivers found %d ports...", nports); + + /* + * All buffers are allocated from the same rte_mempool. + * Thus they all have the same number of data bytes. 
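+ * Each pool element is an rte_mbuf header immediately followed by its
+ * vlib_buffer_t (see rte_mbuf_from_vlib_buffer() in dpdk_priv.h).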
+ */ + dm->vlib_buffer_free_list_index = + vlib_buffer_get_or_create_free_list (vm, + VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES, + "dpdk rx"); + + if (dm->conf->enable_tcp_udp_checksum) + dm->buffer_flags_template &= ~(VNET_BUFFER_F_L4_CHECKSUM_CORRECT + | VNET_BUFFER_F_L4_CHECKSUM_COMPUTED); + + /* vlib_buffer_t template */ + vec_validate_aligned (dm->buffer_templates, tm->n_vlib_mains - 1, + CLIB_CACHE_LINE_BYTES); + for (i = 0; i < tm->n_vlib_mains; i++) + { + vlib_buffer_free_list_t *fl; + vlib_buffer_t *bt = vec_elt_at_index (dm->buffer_templates, i); + fl = vlib_buffer_get_free_list (vm, + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + vlib_buffer_init_for_free_list (bt, fl); + bt->flags = dm->buffer_flags_template; + bt->current_data = -RTE_PKTMBUF_HEADROOM; + vnet_buffer (bt)->sw_if_index[VLIB_TX] = (u32) ~ 0; + } + + for (i = 0; i < nports; i++) + { + u8 addr[6]; + u8 vlan_strip = 0; + int j; + struct rte_eth_dev_info dev_info; + struct rte_eth_link l; + dpdk_device_config_t *devconf = 0; + vlib_pci_addr_t pci_addr; + uword *p = 0; + + rte_eth_dev_info_get (i, &dev_info); + if (dev_info.pci_dev) /* bonded interface has no pci info */ + { + pci_addr.domain = dev_info.pci_dev->addr.domain; + pci_addr.bus = dev_info.pci_dev->addr.bus; + pci_addr.slot = dev_info.pci_dev->addr.devid; + pci_addr.function = dev_info.pci_dev->addr.function; + p = + hash_get (dm->conf->device_config_index_by_pci_addr, + pci_addr.as_u32); + } + + if (p) + devconf = pool_elt_at_index (dm->conf->dev_confs, p[0]); + else + devconf = &dm->conf->default_devconf; + + /* Create vnet interface */ + vec_add2_aligned (dm->devices, xd, 1, CLIB_CACHE_LINE_BYTES); + xd->nb_rx_desc = DPDK_NB_RX_DESC_DEFAULT; + xd->nb_tx_desc = DPDK_NB_TX_DESC_DEFAULT; + xd->cpu_socket = (i8) rte_eth_dev_socket_id (i); + + /* Handle interface naming for devices with multiple ports sharing same PCI ID */ + if (dev_info.pci_dev) + { + struct rte_eth_dev_info di = { 0 }; + rte_eth_dev_info_get (i + 1, &di); + if (di.pci_dev && pci_addr.as_u32 != last_pci_addr.as_u32 && + memcmp (&dev_info.pci_dev->addr, &di.pci_dev->addr, + sizeof (struct rte_pci_addr)) == 0) + { + xd->interface_name_suffix = format (0, "0"); + last_pci_addr.as_u32 = pci_addr.as_u32; + last_pci_addr_port = i; + } + else if (pci_addr.as_u32 == last_pci_addr.as_u32) + { + xd->interface_name_suffix = + format (0, "%u", i - last_pci_addr_port); + } + else + { + last_pci_addr.as_u32 = ~0; + } + } + else + last_pci_addr.as_u32 = ~0; + + clib_memcpy (&xd->tx_conf, &dev_info.default_txconf, + sizeof (struct rte_eth_txconf)); + if (dm->conf->no_multi_seg) + { + xd->tx_conf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS; + port_conf_template.rxmode.jumbo_frame = 0; + port_conf_template.rxmode.enable_scatter = 0; + } + else + { + xd->tx_conf.txq_flags &= ~ETH_TXQ_FLAGS_NOMULTSEGS; + port_conf_template.rxmode.jumbo_frame = 1; + port_conf_template.rxmode.enable_scatter = 1; + xd->flags |= DPDK_DEVICE_FLAG_MAYBE_MULTISEG; + } + + clib_memcpy (&xd->port_conf, &port_conf_template, + sizeof (struct rte_eth_conf)); + + xd->tx_q_used = clib_min (dev_info.max_tx_queues, tm->n_vlib_mains); + + if (devconf->num_tx_queues > 0 + && devconf->num_tx_queues < xd->tx_q_used) + xd->tx_q_used = clib_min (xd->tx_q_used, devconf->num_tx_queues); + + if (devconf->num_rx_queues > 1 && dm->use_rss == 0) + { + dm->use_rss = 1; + } + + if (devconf->num_rx_queues > 1 + && dev_info.max_rx_queues >= devconf->num_rx_queues) + { + xd->rx_q_used = devconf->num_rx_queues; + xd->port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS; + if 
(devconf->rss_fn == 0) + xd->port_conf.rx_adv_conf.rss_conf.rss_hf = + ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP; + else + xd->port_conf.rx_adv_conf.rss_conf.rss_hf = devconf->rss_fn; + } + else + xd->rx_q_used = 1; + + xd->flags |= DPDK_DEVICE_FLAG_PMD; + + /* workaround for drivers not setting driver_name */ + if ((!dev_info.driver_name) && (dev_info.pci_dev)) + dev_info.driver_name = dev_info.pci_dev->driver->driver.name; + + ASSERT (dev_info.driver_name); + + if (!xd->pmd) + { + + +#define _(s,f) else if (dev_info.driver_name && \ + !strcmp(dev_info.driver_name, s)) \ + xd->pmd = VNET_DPDK_PMD_##f; + if (0) + ; + foreach_dpdk_pmd +#undef _ + else + xd->pmd = VNET_DPDK_PMD_UNKNOWN; + + xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN; + xd->nb_rx_desc = DPDK_NB_RX_DESC_DEFAULT; + xd->nb_tx_desc = DPDK_NB_TX_DESC_DEFAULT; + + switch (xd->pmd) + { + /* Drivers with valid speed_capa set */ + case VNET_DPDK_PMD_E1000EM: + case VNET_DPDK_PMD_IGB: + case VNET_DPDK_PMD_IXGBE: + case VNET_DPDK_PMD_I40E: + xd->port_type = port_type_from_speed_capa (&dev_info); + xd->flags |= DPDK_DEVICE_FLAG_TX_OFFLOAD | + DPDK_DEVICE_FLAG_INTEL_PHDR_CKSUM; + + break; + case VNET_DPDK_PMD_CXGBE: + case VNET_DPDK_PMD_MLX4: + case VNET_DPDK_PMD_MLX5: + xd->port_type = port_type_from_speed_capa (&dev_info); + break; + + /* SR-IOV VFs */ + case VNET_DPDK_PMD_IGBVF: + case VNET_DPDK_PMD_IXGBEVF: + case VNET_DPDK_PMD_I40EVF: + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_VF; + xd->port_conf.rxmode.hw_strip_crc = 1; + break; + + case VNET_DPDK_PMD_THUNDERX: + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_VF; + xd->port_conf.rxmode.hw_strip_crc = 1; + break; + + case VNET_DPDK_PMD_DPAA2: + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G; + break; + + /* Cisco VIC */ + case VNET_DPDK_PMD_ENIC: + rte_eth_link_get_nowait (i, &l); + if (l.link_speed == 40000) + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G; + else + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G; + break; + + /* Intel Red Rock Canyon */ + case VNET_DPDK_PMD_FM10K: + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_SWITCH; + xd->port_conf.rxmode.hw_strip_crc = 1; + break; + + /* virtio */ + case VNET_DPDK_PMD_VIRTIO: + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G; + xd->nb_rx_desc = DPDK_NB_RX_DESC_VIRTIO; + xd->nb_tx_desc = DPDK_NB_TX_DESC_VIRTIO; + break; + + /* vmxnet3 */ + case VNET_DPDK_PMD_VMXNET3: + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G; + xd->tx_conf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS; + break; + + case VNET_DPDK_PMD_AF_PACKET: + xd->port_type = VNET_DPDK_PORT_TYPE_AF_PACKET; + xd->port_id = af_packet_port_id++; + break; + + case VNET_DPDK_PMD_BOND: + xd->port_type = VNET_DPDK_PORT_TYPE_ETH_BOND; + xd->port_id = bond_ether_port_id++; + break; + + case VNET_DPDK_PMD_VIRTIO_USER: + xd->port_type = VNET_DPDK_PORT_TYPE_VIRTIO_USER; + break; + + case VNET_DPDK_PMD_VHOST_ETHER: + xd->port_type = VNET_DPDK_PORT_TYPE_VHOST_ETHER; + break; + + default: + xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN; + } + + if (devconf->num_rx_desc) + xd->nb_rx_desc = devconf->num_rx_desc; + + if (devconf->num_tx_desc) + xd->nb_tx_desc = devconf->num_tx_desc; + } + + /* + * Ensure default mtu is not > the mtu read from the hardware. + * Otherwise rte_eth_dev_configure() will fail and the port will + * not be available. + */ + if (ETHERNET_MAX_PACKET_BYTES > dev_info.max_rx_pktlen) + { + /* + * This device does not support the platform's max frame + * size. Use its advertised mru instead. 
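+ * (For example, a hypothetical NIC advertising a 2048-byte max_rx_pktlen
+ * would be capped at 2048 here even though ETHERNET_MAX_PACKET_BYTES is
+ * larger.)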
+ */ + xd->port_conf.rxmode.max_rx_pkt_len = dev_info.max_rx_pktlen; + } + else + { + xd->port_conf.rxmode.max_rx_pkt_len = ETHERNET_MAX_PACKET_BYTES; + + /* + * Some platforms do not account for Ethernet FCS (4 bytes) in + * MTU calculations. To interop with them increase mru but only + * if the device's settings can support it. + */ + if ((dev_info.max_rx_pktlen >= (ETHERNET_MAX_PACKET_BYTES + 4)) && + xd->port_conf.rxmode.hw_strip_crc) + { + /* + * Allow additional 4 bytes (for Ethernet FCS). These bytes are + * stripped by h/w and so will not consume any buffer memory. + */ + xd->port_conf.rxmode.max_rx_pkt_len += 4; + } + } + + if (xd->pmd == VNET_DPDK_PMD_AF_PACKET) + { + f64 now = vlib_time_now (vm); + u32 rnd; + rnd = (u32) (now * 1e6); + rnd = random_u32 (&rnd); + clib_memcpy (addr + 2, &rnd, sizeof (rnd)); + addr[0] = 2; + addr[1] = 0xfe; + } + else + rte_eth_macaddr_get (i, (struct ether_addr *) addr); + + if (xd->tx_q_used < tm->n_vlib_mains) + dpdk_device_lock_init (xd); + + xd->device_index = xd - dm->devices; + ASSERT (i == xd->device_index); + xd->per_interface_next_index = ~0; + + /* assign interface to input thread */ + dpdk_device_and_queue_t *dq; + int q; + + if (devconf->hqos_enabled) + { + xd->flags |= DPDK_DEVICE_FLAG_HQOS; + + if (devconf->hqos.hqos_thread_valid) + { + int cpu = dm->hqos_cpu_first_index + devconf->hqos.hqos_thread; + + if (devconf->hqos.hqos_thread >= dm->hqos_cpu_count) + return clib_error_return (0, "invalid HQoS thread index"); + + vec_add2 (dm->devices_by_hqos_cpu[cpu], dq, 1); + dq->device = xd->device_index; + dq->queue_id = 0; + } + else + { + int cpu = dm->hqos_cpu_first_index + next_hqos_cpu; + + if (dm->hqos_cpu_count == 0) + return clib_error_return (0, "no HQoS threads available"); + + vec_add2 (dm->devices_by_hqos_cpu[cpu], dq, 1); + dq->device = xd->device_index; + dq->queue_id = 0; + + next_hqos_cpu++; + if (next_hqos_cpu == dm->hqos_cpu_count) + next_hqos_cpu = 0; + + devconf->hqos.hqos_thread_valid = 1; + devconf->hqos.hqos_thread = cpu; + } + } + + vec_validate_aligned (xd->tx_vectors, tm->n_vlib_mains, + CLIB_CACHE_LINE_BYTES); + for (j = 0; j < tm->n_vlib_mains; j++) + { + vec_validate_ha (xd->tx_vectors[j], xd->nb_tx_desc, + sizeof (tx_ring_hdr_t), CLIB_CACHE_LINE_BYTES); + vec_reset_length (xd->tx_vectors[j]); + } + + vec_validate_aligned (xd->rx_vectors, xd->rx_q_used, + CLIB_CACHE_LINE_BYTES); + for (j = 0; j < xd->rx_q_used; j++) + { + vec_validate_aligned (xd->rx_vectors[j], VLIB_FRAME_SIZE - 1, + CLIB_CACHE_LINE_BYTES); + vec_reset_length (xd->rx_vectors[j]); + } + + vec_validate_aligned (xd->d_trace_buffers, tm->n_vlib_mains, + CLIB_CACHE_LINE_BYTES); + + + /* count the number of descriptors used for this device */ + nb_desc += xd->nb_rx_desc + xd->nb_tx_desc * xd->tx_q_used; + + error = ethernet_register_interface + (dm->vnet_main, dpdk_device_class.index, xd->device_index, + /* ethernet address */ addr, + &xd->hw_if_index, dpdk_flag_change); + if (error) + return error; + + sw = vnet_get_hw_sw_interface (dm->vnet_main, xd->hw_if_index); + xd->vlib_sw_if_index = sw->sw_if_index; + vnet_hw_interface_set_input_node (dm->vnet_main, xd->hw_if_index, + dpdk_input_node.index); + + if (devconf->workers) + { + int i; + q = 0; + /* *INDENT-OFF* */ + clib_bitmap_foreach (i, devconf->workers, ({ + vnet_hw_interface_assign_rx_thread (dm->vnet_main, xd->hw_if_index, q++, + vdm->first_worker_thread_index + i); + })); + /* *INDENT-ON* */ + } + else + for (q = 0; q < xd->rx_q_used; q++) + { + vnet_hw_interface_assign_rx_thread 
(dm->vnet_main, xd->hw_if_index, q, /* any */
+	     ~1);
+	  }
+
+      hi = vnet_get_hw_interface (dm->vnet_main, xd->hw_if_index);
+
+      if (xd->flags & DPDK_DEVICE_FLAG_TX_OFFLOAD)
+	hi->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_TX_L4_CKSUM_OFFLOAD;
+
+      dpdk_device_setup (xd);
+
+      if (vec_len (xd->errors))
+	clib_warning ("setup failed for device %U. Errors:\n  %U",
+		      format_dpdk_device_name, i,
+		      format_dpdk_device_errors, xd);
+
+      if (devconf->hqos_enabled)
+	{
+	  clib_error_t *rv;
+	  rv = dpdk_port_setup_hqos (xd, &devconf->hqos);
+	  if (rv)
+	    return rv;
+	}
+
+      /*
+       * For Cisco VIC vNIC, set default to VLAN strip enabled, unless
+       * specified otherwise in the startup config.
+       * For other NICs default to VLAN strip disabled, unless specified
+       * otherwise in the startup config.
+       */
+      if (xd->pmd == VNET_DPDK_PMD_ENIC)
+	{
+	  if (devconf->vlan_strip_offload != DPDK_DEVICE_VLAN_STRIP_OFF)
+	    vlan_strip = 1;	/* remove vlan tag from VIC port by default */
+	  else
+	    clib_warning ("VLAN strip disabled for interface\n");
+	}
+      else if (devconf->vlan_strip_offload == DPDK_DEVICE_VLAN_STRIP_ON)
+	vlan_strip = 1;
+
+      if (vlan_strip)
+	{
+	  int vlan_off;
+	  vlan_off = rte_eth_dev_get_vlan_offload (xd->device_index);
+	  vlan_off |= ETH_VLAN_STRIP_OFFLOAD;
+	  xd->port_conf.rxmode.hw_vlan_strip = vlan_off;
+	  if (rte_eth_dev_set_vlan_offload (xd->device_index, vlan_off) == 0)
+	    clib_warning ("VLAN strip enabled for interface\n");
+	  else
+	    clib_warning ("VLAN strip cannot be supported by interface\n");
+	}
+
+      hi->max_l3_packet_bytes[VLIB_RX] = hi->max_l3_packet_bytes[VLIB_TX] =
+	xd->port_conf.rxmode.max_rx_pkt_len - sizeof (ethernet_header_t);
+
+      rte_eth_dev_set_mtu (xd->device_index, hi->max_packet_bytes);
+    }
+
+  if (nb_desc > dm->conf->num_mbufs)
+    clib_warning ("%d mbufs allocated but total rx/tx ring size is %d\n",
+		  dm->conf->num_mbufs, nb_desc);
+
+  return 0;
+}
+
+static void
+dpdk_bind_devices_to_uio (dpdk_config_main_t * conf)
+{
+  vlib_pci_main_t *pm = &pci_main;
+  clib_error_t *error;
+  vlib_pci_device_t *d;
+  u8 *pci_addr = 0;
+  int num_whitelisted = vec_len (conf->dev_confs);
+
+  /* *INDENT-OFF* */
+  pool_foreach (d, pm->pci_devs, ({
+    dpdk_device_config_t * devconf = 0;
+    vec_reset_length (pci_addr);
+    pci_addr = format (pci_addr, "%U%c", format_vlib_pci_addr, &d->bus_address, 0);
+
+    if (d->device_class != PCI_CLASS_NETWORK_ETHERNET && d->device_class != PCI_CLASS_PROCESSOR_CO)
+      continue;
+
+    if (num_whitelisted)
+      {
+	uword * p = hash_get (conf->device_config_index_by_pci_addr, d->bus_address.as_u32);
+
+	if (!p)
+	  continue;
+
+	devconf = pool_elt_at_index (conf->dev_confs, p[0]);
+      }
+
+    /* virtio */
+    if (d->vendor_id == 0x1af4 && d->device_id == 0x1000)
+      ;
+    /* vmxnet3 */
+    else if (d->vendor_id == 0x15ad && d->device_id == 0x07b0)
+      ;
+    /* all Intel network devices */
+    else if (d->vendor_id == 0x8086 && d->device_class == PCI_CLASS_NETWORK_ETHERNET)
+      ;
+    /* all Intel QAT devices VFs */
+    else if (d->vendor_id == 0x8086 && d->device_class == PCI_CLASS_PROCESSOR_CO &&
+	(d->device_id == 0x0443 || d->device_id == 0x37c9 || d->device_id == 0x19e3))
+      ;
+    /* Cisco VIC */
+    else if (d->vendor_id == 0x1137 && d->device_id == 0x0043)
+      ;
+    /* Chelsio T4/T5 */
+    else if (d->vendor_id == 0x1425 && (d->device_id & 0xe000) == 0x4000)
+      ;
+    /* Mellanox */
+    else if (d->vendor_id == 0x15b3 && d->device_id >= 0x1013 && d->device_id <= 0x101a)
+      {
+	continue;
+      }
+    else
+      {
+	clib_warning ("Unsupported PCI device 0x%04x:0x%04x found "
+		      "at PCI address %s\n", (u16) d->vendor_id, (u16) d->device_id,
+		      pci_addr);
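+	/* Unknown vendor/device pair: leave the NIC bound to its kernel
+	   driver rather than attempting a uio bind that may not work. */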
+	continue;
+      }
+
+    error = vlib_pci_bind_to_uio (d, (char *) conf->uio_driver_name);
+
+    if (error)
+      {
+	if (devconf == 0)
+	  {
+	    pool_get (conf->dev_confs, devconf);
+	    hash_set (conf->device_config_index_by_pci_addr, d->bus_address.as_u32,
+		      devconf - conf->dev_confs);
+	    devconf->pci_addr.as_u32 = d->bus_address.as_u32;
+	  }
+	devconf->is_blacklisted = 1;
+	clib_error_report (error);
+      }
+  }));
+  /* *INDENT-ON* */
+  vec_free (pci_addr);
+}
+
+static clib_error_t *
+dpdk_device_config (dpdk_config_main_t * conf, vlib_pci_addr_t pci_addr,
+		    unformat_input_t * input, u8 is_default)
+{
+  clib_error_t *error = 0;
+  uword *p;
+  dpdk_device_config_t *devconf;
+  unformat_input_t sub_input;
+
+  if (is_default)
+    {
+      devconf = &conf->default_devconf;
+    }
+  else
+    {
+      p = hash_get (conf->device_config_index_by_pci_addr, pci_addr.as_u32);
+
+      if (!p)
+	{
+	  pool_get (conf->dev_confs, devconf);
+	  hash_set (conf->device_config_index_by_pci_addr, pci_addr.as_u32,
+		    devconf - conf->dev_confs);
+	}
+      else
+	return clib_error_return (0,
+				  "duplicate configuration for PCI address %U",
+				  format_vlib_pci_addr, &pci_addr);
+    }
+
+  devconf->pci_addr.as_u32 = pci_addr.as_u32;
+  devconf->hqos_enabled = 0;
+  dpdk_device_config_hqos_default (&devconf->hqos);
+
+  if (!input)
+    return 0;
+
+  unformat_skip_white_space (input);
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (input, "num-rx-queues %u", &devconf->num_rx_queues))
+	;
+      else if (unformat (input, "num-tx-queues %u", &devconf->num_tx_queues))
+	;
+      else if (unformat (input, "num-rx-desc %u", &devconf->num_rx_desc))
+	;
+      else if (unformat (input, "num-tx-desc %u", &devconf->num_tx_desc))
+	;
+      else if (unformat (input, "workers %U", unformat_bitmap_list,
+			 &devconf->workers))
+	;
+      else
+	if (unformat
+	    (input, "rss %U", unformat_vlib_cli_sub_input, &sub_input))
+	{
+	  error = unformat_rss_fn (&sub_input, &devconf->rss_fn);
+	  if (error)
+	    break;
+	}
+      else if (unformat (input, "vlan-strip-offload off"))
+	devconf->vlan_strip_offload = DPDK_DEVICE_VLAN_STRIP_OFF;
+      else if (unformat (input, "vlan-strip-offload on"))
+	devconf->vlan_strip_offload = DPDK_DEVICE_VLAN_STRIP_ON;
+      else
+	if (unformat
+	    (input, "hqos %U", unformat_vlib_cli_sub_input, &sub_input))
+	{
+	  devconf->hqos_enabled = 1;
+	  error = unformat_hqos (&sub_input, &devconf->hqos);
+	  if (error)
+	    break;
+	}
+      else if (unformat (input, "hqos"))
+	{
+	  devconf->hqos_enabled = 1;
+	}
+      else
+	{
+	  error = clib_error_return (0, "unknown input `%U'",
+				     format_unformat_error, input);
+	  break;
+	}
+    }
+
+  if (error)
+    return error;
+
+  if (devconf->workers && devconf->num_rx_queues == 0)
+    devconf->num_rx_queues = clib_bitmap_count_set_bits (devconf->workers);
+  else if (devconf->workers &&
+	   clib_bitmap_count_set_bits (devconf->workers) !=
+	   devconf->num_rx_queues)
+    error =
+      clib_error_return (0,
+			 "%U: number of worker threads must be "
+			 "equal to number of rx queues", format_vlib_pci_addr,
+			 &pci_addr);
+
+  return error;
+}
+
+static clib_error_t *
+dpdk_config (vlib_main_t * vm, unformat_input_t * input)
+{
+  clib_error_t *error = 0;
+  dpdk_main_t *dm = &dpdk_main;
+  dpdk_config_main_t *conf = &dpdk_config_main;
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  dpdk_device_config_t *devconf;
+  vlib_pci_addr_t pci_addr;
+  unformat_input_t sub_input;
+  uword x;
+  u8 *s, *tmp = 0;
+  u8 *rte_cmd = 0, *ethname = 0;
+  u32 log_level;
+  int ret, i;
+  int num_whitelisted = 0;
+  u8 no_pci = 0;
+  u8 no_huge = 0;
+  u8 huge_dir = 0;
+  u8 file_prefix = 0;
+  u8 *socket_mem =
0; + u8 *huge_dir_path = 0; + + huge_dir_path = + format (0, "%s/hugepages%c", vlib_unix_get_runtime_dir (), 0); + + conf->device_config_index_by_pci_addr = hash_create (0, sizeof (uword)); + log_level = RTE_LOG_NOTICE; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + /* Prime the pump */ + if (unformat (input, "no-hugetlb")) + { + vec_add1 (conf->eal_init_args, (u8 *) "no-huge"); + no_huge = 1; + } + + else if (unformat (input, "enable-tcp-udp-checksum")) + conf->enable_tcp_udp_checksum = 1; + + else if (unformat (input, "decimal-interface-names")) + conf->interface_name_format_decimal = 1; + + else if (unformat (input, "log-level %U", unformat_dpdk_log_level, &x)) + log_level = x; + + else if (unformat (input, "no-multi-seg")) + conf->no_multi_seg = 1; + + else if (unformat (input, "dev default %U", unformat_vlib_cli_sub_input, + &sub_input)) + { + error = + dpdk_device_config (conf, (vlib_pci_addr_t) (u32) ~ 1, &sub_input, + 1); + + if (error) + return error; + } + else + if (unformat + (input, "dev %U %U", unformat_vlib_pci_addr, &pci_addr, + unformat_vlib_cli_sub_input, &sub_input)) + { + error = dpdk_device_config (conf, pci_addr, &sub_input, 0); + + if (error) + return error; + + num_whitelisted++; + } + else if (unformat (input, "dev %U", unformat_vlib_pci_addr, &pci_addr)) + { + error = dpdk_device_config (conf, pci_addr, 0, 0); + + if (error) + return error; + + num_whitelisted++; + } + else if (unformat (input, "num-mbufs %d", &conf->num_mbufs)) + ; + else if (unformat (input, "uio-driver %s", &conf->uio_driver_name)) + ; + else if (unformat (input, "socket-mem %s", &socket_mem)) + ; + else if (unformat (input, "no-pci")) + { + no_pci = 1; + tmp = format (0, "--no-pci%c", 0); + vec_add1 (conf->eal_init_args, tmp); + } + else if (unformat (input, "poll-sleep %d", &dm->poll_sleep_usec)) + ; + +#define _(a) \ + else if (unformat(input, #a)) \ + { \ + tmp = format (0, "--%s%c", #a, 0); \ + vec_add1 (conf->eal_init_args, tmp); \ + } + foreach_eal_double_hyphen_predicate_arg +#undef _ +#define _(a) \ + else if (unformat(input, #a " %s", &s)) \ + { \ + if (!strncmp(#a, "huge-dir", 8)) \ + huge_dir = 1; \ + else if (!strncmp(#a, "file-prefix", 11)) \ + file_prefix = 1; \ + tmp = format (0, "--%s%c", #a, 0); \ + vec_add1 (conf->eal_init_args, tmp); \ + vec_add1 (s, 0); \ + if (!strncmp(#a, "vdev", 4)) \ + if (strstr((char*)s, "af_packet")) \ + clib_warning ("af_packet obsoleted. Use CLI 'create host-interface'."); \ + vec_add1 (conf->eal_init_args, s); \ + } + foreach_eal_double_hyphen_arg +#undef _ +#define _(a,b) \ + else if (unformat(input, #a " %s", &s)) \ + { \ + tmp = format (0, "-%s%c", #b, 0); \ + vec_add1 (conf->eal_init_args, tmp); \ + vec_add1 (s, 0); \ + vec_add1 (conf->eal_init_args, s); \ + } + foreach_eal_single_hyphen_arg +#undef _ +#define _(a,b) \ + else if (unformat(input, #a " %s", &s)) \ + { \ + tmp = format (0, "-%s%c", #b, 0); \ + vec_add1 (conf->eal_init_args, tmp); \ + vec_add1 (s, 0); \ + vec_add1 (conf->eal_init_args, s); \ + conf->a##_set_manually = 1; \ + } + foreach_eal_single_hyphen_mandatory_arg +#undef _ + else if (unformat (input, "default")) + ; + + else if (unformat_skip_white_space (input)) + ; + else + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + goto done; + } + } + + if (!conf->uio_driver_name) + conf->uio_driver_name = format (0, "uio_pci_generic%c", 0); + + /* + * Use 1G huge pages if available. 
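+   *
+   * The probe below reads per-NUMA-node free hugepage counters from
+   * sysfs. 1G pages are preferred; the code falls back to 2M pages only
+   * when every socket needs less than 1 GB and enough 2M pages are free.
+   * In sketch form:
+   *
+   *   use 1G  if  free 1G pages cover mem_by_socket[c] on every socket
+   *               and not (all sockets < 1 GB and 2M pages suffice)
+   *   use 2M  otherwise, when free 2M pages cover the per-socket demand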
+   */
+  if (!no_huge && !huge_dir)
+    {
+      u32 x, *mem_by_socket = 0;
+      uword c = 0;
+      u8 use_1g = 1;
+      u8 use_2m = 1;
+      u8 less_than_1g = 1;
+      int rv;
+
+      umount ((char *) huge_dir_path);
+
+      /* Process "socket-mem" parameter value */
+      if (vec_len (socket_mem))
+	{
+	  unformat_input_t in;
+	  unformat_init_vector (&in, socket_mem);
+	  while (unformat_check_input (&in) != UNFORMAT_END_OF_INPUT)
+	    {
+	      if (unformat (&in, "%u,", &x))
+		;
+	      else if (unformat (&in, "%u", &x))
+		;
+	      else if (unformat (&in, ","))
+		x = 0;
+	      else
+		break;
+
+	      vec_add1 (mem_by_socket, x);
+
+	      if (x > 1023)
+		less_than_1g = 0;
+	    }
+	  /* Note: unformat_free vec_frees(in.buffer), aka socket_mem... */
+	  unformat_free (&in);
+	  socket_mem = 0;
+	}
+      else
+	{
+	  /* *INDENT-OFF* */
+	  clib_bitmap_foreach (c, tm->cpu_socket_bitmap, (
+	    {
+	      vec_validate(mem_by_socket, c);
+	      mem_by_socket[c] = 256; /* default per-socket mem */
+	    }
+	  ));
+	  /* *INDENT-ON* */
+	}
+
+      /* check if enough 1GB pages are available for each socket */
+      /* *INDENT-OFF* */
+      clib_bitmap_foreach (c, tm->cpu_socket_bitmap, (
+        {
+	  int pages_avail, page_size, mem;
+	  clib_error_t *e = 0;
+
+	  vec_validate(mem_by_socket, c);
+	  mem = mem_by_socket[c];
+
+	  page_size = 1024;
+	  e = clib_sysfs_get_free_hugepages(c, page_size * 1024, &pages_avail);
+
+	  if (e != 0 || pages_avail < 0 || page_size * pages_avail < mem)
+	    use_1g = 0;
+
+	  if (e)
+	    clib_error_free (e);
+
+	  page_size = 2;
+	  e = clib_sysfs_get_free_hugepages(c, page_size * 1024, &pages_avail);
+
+	  if (e != 0 || pages_avail < 0 || page_size * pages_avail < mem)
+	    use_2m = 0;
+
+	  if (e)
+	    clib_error_free (e);
+      }));
+      /* *INDENT-ON* */
+
+      if (mem_by_socket == 0)
+	{
+	  error = clib_error_return (0, "mem_by_socket NULL");
+	  goto done;
+	}
+      _vec_len (mem_by_socket) = c + 1;
+
+      /* regenerate socket_mem string */
+      vec_foreach_index (x, mem_by_socket)
+	socket_mem = format (socket_mem, "%s%u",
+			     socket_mem ? "," : "", mem_by_socket[x]);
+      socket_mem = format (socket_mem, "%c", 0);
+
+      vec_free (mem_by_socket);
+
+      error = vlib_unix_recursive_mkdir ((char *) huge_dir_path);
+      if (error)
+	{
+	  goto done;
+	}
+
+      if (use_1g && !(less_than_1g && use_2m))
+	{
+	  rv = mount ("none", (char *) huge_dir_path, "hugetlbfs", 0,
+		      "pagesize=1G");
+	}
+      else if (use_2m)
+	{
+	  rv = mount ("none", (char *) huge_dir_path, "hugetlbfs", 0, NULL);
+	}
+      else
+	{
+	  return clib_error_return (0, "not enough free huge pages");
+	}
+
+      if (rv)
+	{
+	  error = clib_error_return (0, "mount failed %d", errno);
+	  goto done;
+	}
+
+      tmp = format (0, "--huge-dir%c", 0);
+      vec_add1 (conf->eal_init_args, tmp);
+      tmp = format (0, "%s%c", huge_dir_path, 0);
+      vec_add1 (conf->eal_init_args, tmp);
+      if (!file_prefix)
+	{
+	  tmp = format (0, "--file-prefix%c", 0);
+	  vec_add1 (conf->eal_init_args, tmp);
+	  tmp = format (0, "vpp%c", 0);
+	  vec_add1 (conf->eal_init_args, tmp);
+	}
+    }
+
+  vec_free (rte_cmd);
+  vec_free (ethname);
+
+  if (error)
+    return error;
+
+  /* I'll bet that -c and -n must be the first and second args...
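+     so they are spliced in right after argv[0] below. The resulting EAL
+     vector looks roughly like this (coremask/channel values illustrative):
+
+       vnet -c 0xf -n 4 --huge-dir ... --master-lcore 0 --socket-mem 256,256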
*/ + if (!conf->coremask_set_manually) + { + vlib_thread_registration_t *tr; + uword *coremask = 0; + int i; + + /* main thread core */ + coremask = clib_bitmap_set (coremask, tm->main_lcore, 1); + + for (i = 0; i < vec_len (tm->registrations); i++) + { + tr = tm->registrations[i]; + coremask = clib_bitmap_or (coremask, tr->coremask); + } + + vec_insert (conf->eal_init_args, 2, 1); + conf->eal_init_args[1] = (u8 *) "-c"; + tmp = format (0, "%U%c", format_bitmap_hex, coremask, 0); + conf->eal_init_args[2] = tmp; + clib_bitmap_free (coremask); + } + + if (!conf->nchannels_set_manually) + { + vec_insert (conf->eal_init_args, 2, 3); + conf->eal_init_args[3] = (u8 *) "-n"; + tmp = format (0, "%d", conf->nchannels); + conf->eal_init_args[4] = tmp; + } + + if (no_pci == 0 && geteuid () == 0) + dpdk_bind_devices_to_uio (conf); + +#define _(x) \ + if (devconf->x == 0 && conf->default_devconf.x > 0) \ + devconf->x = conf->default_devconf.x ; + + /* *INDENT-OFF* */ + pool_foreach (devconf, conf->dev_confs, ({ + + /* default per-device config items */ + foreach_dpdk_device_config_item + + /* add DPDK EAL whitelist/blacklist entry */ + if (num_whitelisted > 0 && devconf->is_blacklisted == 0) + { + tmp = format (0, "-w%c", 0); + vec_add1 (conf->eal_init_args, tmp); + tmp = format (0, "%U%c", format_vlib_pci_addr, &devconf->pci_addr, 0); + vec_add1 (conf->eal_init_args, tmp); + } + else if (num_whitelisted == 0 && devconf->is_blacklisted != 0) + { + tmp = format (0, "-b%c", 0); + vec_add1 (conf->eal_init_args, tmp); + tmp = format (0, "%U%c", format_vlib_pci_addr, &devconf->pci_addr, 0); + vec_add1 (conf->eal_init_args, tmp); + } + })); + /* *INDENT-ON* */ + +#undef _ + + /* set master-lcore */ + tmp = format (0, "--master-lcore%c", 0); + vec_add1 (conf->eal_init_args, tmp); + tmp = format (0, "%u%c", tm->main_lcore, 0); + vec_add1 (conf->eal_init_args, tmp); + + /* set socket-mem */ + tmp = format (0, "--socket-mem%c", 0); + vec_add1 (conf->eal_init_args, tmp); + tmp = format (0, "%s%c", socket_mem, 0); + vec_add1 (conf->eal_init_args, tmp); + + /* NULL terminate the "argv" vector, in case of stupidity */ + vec_add1 (conf->eal_init_args, 0); + _vec_len (conf->eal_init_args) -= 1; + + /* Set up DPDK eal and packet mbuf pool early. 
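+     The argument vector assembled above is handed to rte_eal_init()
+     exactly once; the hugepage mount is lazily detached again right
+     after, since EAL keeps its mappings alive without it.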
*/ + + rte_log_set_global_level (log_level); + + vm = vlib_get_main (); + + /* make copy of args as rte_eal_init tends to mess up with arg array */ + for (i = 1; i < vec_len (conf->eal_init_args); i++) + conf->eal_init_args_str = format (conf->eal_init_args_str, "%s ", + conf->eal_init_args[i]); + + clib_warning ("EAL init args: %s", conf->eal_init_args_str); + ret = + rte_eal_init (vec_len (conf->eal_init_args), + (char **) conf->eal_init_args); + + /* lazy umount hugepages */ + umount2 ((char *) huge_dir_path, MNT_DETACH); + rmdir ((char *) huge_dir_path); + vec_free (huge_dir_path); + + if (ret < 0) + return clib_error_return (0, "rte_eal_init returned %d", ret); + + /* Dump the physical memory layout prior to creating the mbuf_pool */ + fprintf (stdout, "DPDK physical memory layout:\n"); + rte_dump_physmem_layout (stdout); + + /* main thread 1st */ + error = dpdk_buffer_pool_create (vm, conf->num_mbufs, rte_socket_id ()); + if (error) + return error; + + for (i = 0; i < RTE_MAX_LCORE; i++) + { + error = dpdk_buffer_pool_create (vm, conf->num_mbufs, + rte_lcore_to_socket_id (i)); + if (error) + return error; + } + +done: + return error; +} + +VLIB_CONFIG_FUNCTION (dpdk_config, "dpdk"); + +void +dpdk_update_link_state (dpdk_device_t * xd, f64 now) +{ + vnet_main_t *vnm = vnet_get_main (); + struct rte_eth_link prev_link = xd->link; + u32 hw_flags = 0; + u8 hw_flags_chg = 0; + + /* only update link state for PMD interfaces */ + if ((xd->flags & DPDK_DEVICE_FLAG_PMD) == 0) + return; + + xd->time_last_link_update = now ? now : xd->time_last_link_update; + memset (&xd->link, 0, sizeof (xd->link)); + rte_eth_link_get_nowait (xd->device_index, &xd->link); + + if (LINK_STATE_ELOGS) + { + vlib_main_t *vm = vlib_get_main (); + ELOG_TYPE_DECLARE (e) = + { + .format = + "update-link-state: sw_if_index %d, admin_up %d," + "old link_state %d new link_state %d",.format_args = "i4i1i1i1",}; + + struct + { + u32 sw_if_index; + u8 admin_up; + u8 old_link_state; + u8 new_link_state; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->sw_if_index = xd->vlib_sw_if_index; + ed->admin_up = (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) != 0; + ed->old_link_state = (u8) + vnet_hw_interface_is_link_up (vnm, xd->hw_if_index); + ed->new_link_state = (u8) xd->link.link_status; + } + + if ((xd->flags & (DPDK_DEVICE_FLAG_ADMIN_UP | DPDK_DEVICE_FLAG_BOND_SLAVE)) + && ((xd->link.link_status != 0) ^ + vnet_hw_interface_is_link_up (vnm, xd->hw_if_index))) + { + hw_flags_chg = 1; + hw_flags |= (xd->link.link_status ? 
VNET_HW_INTERFACE_FLAG_LINK_UP : 0);
+    }
+
+  if (hw_flags_chg || (xd->link.link_duplex != prev_link.link_duplex))
+    {
+      hw_flags_chg = 1;
+      switch (xd->link.link_duplex)
+	{
+	case ETH_LINK_HALF_DUPLEX:
+	  hw_flags |= VNET_HW_INTERFACE_FLAG_HALF_DUPLEX;
+	  break;
+	case ETH_LINK_FULL_DUPLEX:
+	  hw_flags |= VNET_HW_INTERFACE_FLAG_FULL_DUPLEX;
+	  break;
+	default:
+	  break;
+	}
+    }
+  if (hw_flags_chg || (xd->link.link_speed != prev_link.link_speed))
+    {
+      hw_flags_chg = 1;
+      switch (xd->link.link_speed)
+	{
+	case ETH_SPEED_NUM_10M:
+	  hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_10M;
+	  break;
+	case ETH_SPEED_NUM_100M:
+	  hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_100M;
+	  break;
+	case ETH_SPEED_NUM_1G:
+	  hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_1G;
+	  break;
+	case ETH_SPEED_NUM_10G:
+	  hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_10G;
+	  break;
+	case ETH_SPEED_NUM_40G:
+	  hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_40G;
+	  break;
+	case 0:
+	  break;
+	default:
+	  clib_warning ("unknown link speed %d", xd->link.link_speed);
+	  break;
+	}
+    }
+  if (hw_flags_chg)
+    {
+      if (LINK_STATE_ELOGS)
+	{
+	  vlib_main_t *vm = vlib_get_main ();
+
+	  ELOG_TYPE_DECLARE (e) =
+	  {
+	    .format =
+	      "update-link-state: sw_if_index %d, new flags %d",.format_args
+	      = "i4i4",};
+
+	  struct
+	  {
+	    u32 sw_if_index;
+	    u32 flags;
+	  } *ed;
+	  ed = ELOG_DATA (&vm->elog_main, e);
+	  ed->sw_if_index = xd->vlib_sw_if_index;
+	  ed->flags = hw_flags;
+	}
+      vnet_hw_interface_set_flags (vnm, xd->hw_if_index, hw_flags);
+    }
+}
+
+static uword
+dpdk_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f)
+{
+  clib_error_t *error;
+  vnet_main_t *vnm = vnet_get_main ();
+  dpdk_main_t *dm = &dpdk_main;
+  ethernet_main_t *em = &ethernet_main;
+  dpdk_device_t *xd;
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  int i;
+
+  error = dpdk_lib_init (dm);
+
+  if (error)
+    clib_error_report (error);
+
+  tm->worker_thread_release = 1;
+
+  f64 now = vlib_time_now (vm);
+  vec_foreach (xd, dm->devices)
+  {
+    dpdk_update_link_state (xd, now);
+  }
+
+  {
+    /*
+     * Extra set up for bond interfaces:
+     * 1. Set up MACs for bond interfaces and their slave links; this was
+     *    done in dpdk_device_setup() but needs to be done again here to
+     *    take effect.
+     * 2. Set up info and register slave link state change callback handling.
+     * 3. Set up info for bond interface related CLI support.
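+     *
+     * Slave MACs are forced to the MAC of the first slave below, so the
+     * bond presents a single station address on every member link.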
+     */
+    int nports = rte_eth_dev_count ();
+    if (nports > 0)
+      {
+	for (i = 0; i < nports; i++)
+	  {
+	    xd = &dm->devices[i];
+	    ASSERT (i == xd->device_index);
+	    if (xd->pmd == VNET_DPDK_PMD_BOND)
+	      {
+		u8 addr[6];
+		u8 slink[16];
+		int nlink = rte_eth_bond_slaves_get (i, slink, 16);
+		if (nlink > 0)
+		  {
+		    vnet_hw_interface_t *bhi;
+		    ethernet_interface_t *bei;
+		    int rv;
+
+		    /* Get MAC of 1st slave link */
+		    rte_eth_macaddr_get
+		      (slink[0], (struct ether_addr *) addr);
+
+		    /* Set MAC of bonded interface to that of 1st slave link */
+		    clib_warning ("Set MAC for bond port %d BondEthernet%d",
+				  i, xd->port_id);
+		    rv = rte_eth_bond_mac_address_set
+		      (i, (struct ether_addr *) addr);
+		    if (rv)
+		      clib_warning ("Set MAC addr failure rv=%d", rv);
+
+		    /* Populate MAC of bonded interface in VPP hw tables */
+		    bhi = vnet_get_hw_interface
+		      (vnm, dm->devices[i].hw_if_index);
+		    bei = pool_elt_at_index
+		      (em->interfaces, bhi->hw_instance);
+		    clib_memcpy (bhi->hw_address, addr, 6);
+		    clib_memcpy (bei->address, addr, 6);
+
+		    /* Init l3 packet size allowed on bonded interface */
+		    bhi->max_packet_bytes = ETHERNET_MAX_PACKET_BYTES;
+		    bhi->max_l3_packet_bytes[VLIB_RX] =
+		      bhi->max_l3_packet_bytes[VLIB_TX] =
+		      ETHERNET_MAX_PACKET_BYTES - sizeof (ethernet_header_t);
+		    while (nlink >= 1)
+		      {		/* for all slave links */
+			int slave = slink[--nlink];
+			dpdk_device_t *sdev = &dm->devices[slave];
+			vnet_hw_interface_t *shi;
+			vnet_sw_interface_t *ssi;
+			ethernet_interface_t *sei;
+			/* Add MAC to all slave links except the first one */
+			if (nlink)
+			  {
+			    clib_warning ("Add MAC for slave port %d", slave);
+			    rv = rte_eth_dev_mac_addr_add
+			      (slave, (struct ether_addr *) addr, 0);
+			    if (rv)
+			      clib_warning ("Add MAC addr failure rv=%d", rv);
+			  }
+			/* Setup slave link state change callback handling */
+			rte_eth_dev_callback_register
+			  (slave, RTE_ETH_EVENT_INTR_LSC,
+			   dpdk_port_state_callback, NULL);
+			dpdk_device_t *sxd = &dm->devices[slave];
+			sxd->flags |= DPDK_DEVICE_FLAG_BOND_SLAVE;
+			sxd->bond_port = i;
+			/* Set slaves bitmap for bonded interface */
+			bhi->bond_info = clib_bitmap_set
+			  (bhi->bond_info, sdev->hw_if_index, 1);
+			/* Set MACs and slave link flags on slave interface */
+			shi = vnet_get_hw_interface (vnm, sdev->hw_if_index);
+			ssi = vnet_get_sw_interface
+			  (vnm, sdev->vlib_sw_if_index);
+			sei = pool_elt_at_index
+			  (em->interfaces, shi->hw_instance);
+			shi->bond_info = VNET_HW_INTERFACE_BOND_INFO_SLAVE;
+			ssi->flags |= VNET_SW_INTERFACE_FLAG_BOND_SLAVE;
+			clib_memcpy (shi->hw_address, addr, 6);
+			clib_memcpy (sei->address, addr, 6);
+			/* Set l3 packet size allowed as the lowest of slaves */
+			if (bhi->max_l3_packet_bytes[VLIB_RX] >
+			    shi->max_l3_packet_bytes[VLIB_RX])
+			  bhi->max_l3_packet_bytes[VLIB_RX] =
+			    bhi->max_l3_packet_bytes[VLIB_TX] =
+			    shi->max_l3_packet_bytes[VLIB_RX];
+			/* Set max packet size allowed as the lowest of slaves */
+			if (bhi->max_packet_bytes > shi->max_packet_bytes)
+			  bhi->max_packet_bytes = shi->max_packet_bytes;
+		      }
+		  }
+	      }
+	  }
+      }
+  }
+
+  while (1)
+    {
+      /*
+       * check each time through the loop in case intervals are changed
+       */
+      f64 min_wait = dm->link_state_poll_interval < dm->stat_poll_interval ?
+ dm->link_state_poll_interval : dm->stat_poll_interval; + + vlib_process_wait_for_event_or_clock (vm, min_wait); + + if (dm->admin_up_down_in_progress) + /* skip the poll if an admin up down is in progress (on any interface) */ + continue; + + vec_foreach (xd, dm->devices) + { + f64 now = vlib_time_now (vm); + if ((now - xd->time_last_stats_update) >= dm->stat_poll_interval) + dpdk_update_counters (xd, now); + if ((now - xd->time_last_link_update) >= dm->link_state_poll_interval) + dpdk_update_link_state (xd, now); + + } + } + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (dpdk_process_node,static) = { + .function = dpdk_process, + .type = VLIB_NODE_TYPE_PROCESS, + .name = "dpdk-process", + .process_log2_n_stack_bytes = 17, +}; +/* *INDENT-ON* */ + +static clib_error_t * +dpdk_init (vlib_main_t * vm) +{ + dpdk_main_t *dm = &dpdk_main; + vlib_node_t *ei; + clib_error_t *error = 0; + vlib_thread_main_t *tm = vlib_get_thread_main (); + + /* verify that structs are cacheline aligned */ + STATIC_ASSERT (offsetof (dpdk_device_t, cacheline0) == 0, + "Cache line marker must be 1st element in dpdk_device_t"); + STATIC_ASSERT (offsetof (dpdk_device_t, cacheline1) == + CLIB_CACHE_LINE_BYTES, + "Data in cache line 0 is bigger than cache line size"); + STATIC_ASSERT (offsetof (frame_queue_trace_t, cacheline0) == 0, + "Cache line marker must be 1st element in frame_queue_trace_t"); + + dm->vlib_main = vm; + dm->vnet_main = vnet_get_main (); + dm->conf = &dpdk_config_main; + + ei = vlib_get_node_by_name (vm, (u8 *) "ethernet-input"); + if (ei == 0) + return clib_error_return (0, "ethernet-input node AWOL"); + + dm->ethernet_input_node_index = ei->index; + + dm->conf->nchannels = 4; + dm->conf->num_mbufs = dm->conf->num_mbufs ? dm->conf->num_mbufs : NB_MBUF; + vec_add1 (dm->conf->eal_init_args, (u8 *) "vnet"); + + vec_validate (dm->recycle, tm->n_thread_stacks - 1); + + /* Default vlib_buffer_t flags, DISABLES tcp/udp checksumming... */ + dm->buffer_flags_template = + (VLIB_BUFFER_TOTAL_LENGTH_VALID | VLIB_BUFFER_EXT_HDR_VALID + | VNET_BUFFER_F_L4_CHECKSUM_COMPUTED | + VNET_BUFFER_F_L4_CHECKSUM_CORRECT); + + dm->stat_poll_interval = DPDK_STATS_POLL_INTERVAL; + dm->link_state_poll_interval = DPDK_LINK_POLL_INTERVAL; + + /* init CLI */ + if ((error = vlib_call_init_function (vm, dpdk_cli_init))) + return error; + + return error; +} + +VLIB_INIT_FUNCTION (dpdk_init); + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c new file mode 100644 index 00000000..cf8b9699 --- /dev/null +++ b/src/plugins/dpdk/device/node.c @@ -0,0 +1,704 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include <vnet/vnet.h> +#include <vppinfra/vec.h> +#include <vppinfra/error.h> +#include <vppinfra/format.h> +#include <vppinfra/xxhash.h> + +#include <vnet/ethernet/ethernet.h> +#include <dpdk/device/dpdk.h> +#include <vnet/classify/vnet_classify.h> +#include <vnet/mpls/packet.h> +#include <vnet/handoff.h> +#include <vnet/devices/devices.h> +#include <vnet/feature/feature.h> + +#include <dpdk/device/dpdk_priv.h> + +static char *dpdk_error_strings[] = { +#define _(n,s) s, + foreach_dpdk_error +#undef _ +}; + +always_inline int +vlib_buffer_is_ip4 (vlib_buffer_t * b) +{ + ethernet_header_t *h = (ethernet_header_t *) vlib_buffer_get_current (b); + return (h->type == clib_host_to_net_u16 (ETHERNET_TYPE_IP4)); +} + +always_inline int +vlib_buffer_is_ip6 (vlib_buffer_t * b) +{ + ethernet_header_t *h = (ethernet_header_t *) vlib_buffer_get_current (b); + return (h->type == clib_host_to_net_u16 (ETHERNET_TYPE_IP6)); +} + +always_inline int +vlib_buffer_is_mpls (vlib_buffer_t * b) +{ + ethernet_header_t *h = (ethernet_header_t *) vlib_buffer_get_current (b); + return (h->type == clib_host_to_net_u16 (ETHERNET_TYPE_MPLS)); +} + +always_inline u32 +dpdk_rx_next_from_etype (struct rte_mbuf * mb, vlib_buffer_t * b0) +{ + if (PREDICT_TRUE (vlib_buffer_is_ip4 (b0))) + { + if (PREDICT_TRUE ((mb->ol_flags & PKT_RX_IP_CKSUM_GOOD) != 0)) + return VNET_DEVICE_INPUT_NEXT_IP4_NCS_INPUT; + else + return VNET_DEVICE_INPUT_NEXT_IP4_INPUT; + } + else if (PREDICT_TRUE (vlib_buffer_is_ip6 (b0))) + return VNET_DEVICE_INPUT_NEXT_IP6_INPUT; + else if (PREDICT_TRUE (vlib_buffer_is_mpls (b0))) + return VNET_DEVICE_INPUT_NEXT_MPLS_INPUT; + else + return VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; +} + +always_inline u32 +dpdk_rx_next_from_packet_start (struct rte_mbuf * mb, vlib_buffer_t * b0) +{ + word start_delta; + int rv; + + start_delta = b0->current_data - + ((mb->buf_addr + mb->data_off) - (void *) b0->data); + + vlib_buffer_advance (b0, -start_delta); + + if (PREDICT_TRUE (vlib_buffer_is_ip4 (b0))) + { + if (PREDICT_TRUE ((mb->ol_flags & PKT_RX_IP_CKSUM_GOOD) != 0)) + rv = VNET_DEVICE_INPUT_NEXT_IP4_NCS_INPUT; + else + rv = VNET_DEVICE_INPUT_NEXT_IP4_INPUT; + } + else if (PREDICT_TRUE (vlib_buffer_is_ip6 (b0))) + rv = VNET_DEVICE_INPUT_NEXT_IP6_INPUT; + else if (PREDICT_TRUE (vlib_buffer_is_mpls (b0))) + rv = VNET_DEVICE_INPUT_NEXT_MPLS_INPUT; + else + rv = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; + + vlib_buffer_advance (b0, start_delta); + return rv; +} + +always_inline void +dpdk_rx_error_from_mb (struct rte_mbuf *mb, u32 * next, u8 * error) +{ + if (mb->ol_flags & PKT_RX_IP_CKSUM_BAD) + { + *error = DPDK_ERROR_IP_CHECKSUM_ERROR; + *next = VNET_DEVICE_INPUT_NEXT_DROP; + } + else + *error = DPDK_ERROR_NONE; +} + +static void +dpdk_rx_trace (dpdk_main_t * dm, + vlib_node_runtime_t * node, + dpdk_device_t * xd, + u16 queue_id, u32 * buffers, uword n_buffers) +{ + vlib_main_t *vm = vlib_get_main (); + u32 *b, n_left; + u32 next0; + + n_left = n_buffers; + b = buffers; + + while (n_left >= 1) + { + u32 bi0; + vlib_buffer_t *b0; + dpdk_rx_dma_trace_t *t0; + struct rte_mbuf *mb; + u8 error0; + + bi0 = b[0]; + n_left -= 1; + + b0 = vlib_get_buffer (vm, bi0); + mb = rte_mbuf_from_vlib_buffer (b0); + + if (PREDICT_FALSE (xd->per_interface_next_index != ~0)) + next0 = xd->per_interface_next_index; + else + next0 = dpdk_rx_next_from_packet_start (mb, b0); + + dpdk_rx_error_from_mb (mb, &next0, &error0); + + vlib_trace_buffer (vm, node, next0, b0, /* follow_chain */ 0); + t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); + 
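+      /* Record the rx queue and device ids, then snapshot the rte_mbuf,
+         the vlib_buffer_t metadata and the leading payload bytes so the
+         trace shows exactly what the driver handed us. */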
t0->queue_index = queue_id;
+      t0->device_index = xd->device_index;
+      t0->buffer_index = bi0;
+
+      clib_memcpy (&t0->mb, mb, sizeof (t0->mb));
+      clib_memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data));
+      clib_memcpy (t0->buffer.pre_data, b0->data,
+		   sizeof (t0->buffer.pre_data));
+      clib_memcpy (&t0->data, mb->buf_addr + mb->data_off, sizeof (t0->data));
+
+      b += 1;
+    }
+}
+
+static inline u32
+dpdk_rx_burst (dpdk_main_t * dm, dpdk_device_t * xd, u16 queue_id)
+{
+  u32 n_buffers;
+  u32 n_left;
+  u32 n_this_chunk;
+
+  n_left = VLIB_FRAME_SIZE;
+  n_buffers = 0;
+
+  if (PREDICT_TRUE (xd->flags & DPDK_DEVICE_FLAG_PMD))
+    {
+      while (n_left)
+	{
+	  n_this_chunk = rte_eth_rx_burst (xd->device_index, queue_id,
+					   xd->rx_vectors[queue_id] +
+					   n_buffers, n_left);
+	  n_buffers += n_this_chunk;
+	  n_left -= n_this_chunk;
+
+	  /* Empirically, DPDK r1.8 produces vectors w/ 32 or fewer elts */
+	  if (n_this_chunk < 32)
+	    break;
+	}
+    }
+  else
+    {
+      ASSERT (0);
+    }
+
+  return n_buffers;
+}
+
+
+static_always_inline void
+dpdk_process_subseq_segs (vlib_main_t * vm, vlib_buffer_t * b,
+			  struct rte_mbuf *mb, vlib_buffer_free_list_t * fl)
+{
+  u8 nb_seg = 1;
+  struct rte_mbuf *mb_seg = 0;
+  vlib_buffer_t *b_seg, *b_chain = 0;
+  mb_seg = mb->next;
+  b_chain = b;
+
+  if (mb->nb_segs < 2)
+    return;
+
+  b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
+  b->total_length_not_including_first_buffer = 0;
+
+  while (nb_seg < mb->nb_segs)
+    {
+      ASSERT (mb_seg != 0);
+
+      b_seg = vlib_buffer_from_rte_mbuf (mb_seg);
+      vlib_buffer_init_for_free_list (b_seg, fl);
+
+      ASSERT ((b_seg->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
+      ASSERT (b_seg->current_data == 0);
+
+      /*
+       * The driver (e.g. virtio) may not put the packet data at the start
+       * of the segment, so don't assume b_seg->current_data == 0 is correct.
+       */
+      b_seg->current_data =
+	(mb_seg->buf_addr + mb_seg->data_off) - (void *) b_seg->data;
+
+      b_seg->current_length = mb_seg->data_len;
+      b->total_length_not_including_first_buffer += mb_seg->data_len;
+
+      b_chain->flags |= VLIB_BUFFER_NEXT_PRESENT;
+      b_chain->next_buffer = vlib_get_buffer_index (vm, b_seg);
+
+      b_chain = b_seg;
+      mb_seg = mb_seg->next;
+      nb_seg++;
+    }
+}
+
+static_always_inline void
+dpdk_prefetch_buffer (struct rte_mbuf *mb)
+{
+  vlib_buffer_t *b = vlib_buffer_from_rte_mbuf (mb);
+  CLIB_PREFETCH (mb, CLIB_CACHE_LINE_BYTES, LOAD);
+  CLIB_PREFETCH (b, CLIB_CACHE_LINE_BYTES, STORE);
+}
+
+static_always_inline void
+dpdk_prefetch_ethertype (struct rte_mbuf *mb)
+{
+  CLIB_PREFETCH (mb->buf_addr + mb->data_off +
+		 STRUCT_OFFSET_OF (ethernet_header_t, type),
+		 CLIB_CACHE_LINE_BYTES, LOAD);
+}
+
+
+/*
+   This function fills the 1st cacheline of vlib_buffer_t metadata with data
+   from the buffer template. Instead of filling field by field, we construct
+   a template and then use 128/256 bit vector instructions to copy the data.
+   This code first loads the whole cacheline into 4 128-bit registers (xmm)
+   or two 256-bit registers (ymm) and then stores the data into all 4 buffers,
+   effectively saving on register load operations.
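+
+   A minimal sketch of the idea (one 16-byte lane shown; the loop below
+   covers the full cacheline and all four destinations):
+
+     u8x16 t = *(u8x16 *) ((u8 *) s + i * 16);   // load template lane once
+     *(u8x16 *) ((u8 *) d0 + i * 16) = t;        // ...store it four times
+     *(u8x16 *) ((u8 *) d1 + i * 16) = t;        // (likewise for d2, d3)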
+*/ + +static_always_inline void +dpdk_buffer_init_from_template (void *d0, void *d1, void *d2, void *d3, + void *s) +{ +#if defined(CLIB_HAVE_VEC128) + int i; + for (i = 0; i < 2; i++) + { + *(u8x32 *) (((u8 *) d0) + i * 32) = + *(u8x32 *) (((u8 *) d1) + i * 32) = + *(u8x32 *) (((u8 *) d2) + i * 32) = + *(u8x32 *) (((u8 *) d3) + i * 32) = *(u8x32 *) (((u8 *) s) + i * 32); + } +#elif defined(CLIB_HAVE_VEC64) + int i; + for (i = 0; i < 4; i++) + { + *(u8x16 *) (((u8 *) d0) + i * 16) = + *(u8x16 *) (((u8 *) d1) + i * 16) = + *(u8x16 *) (((u8 *) d2) + i * 16) = + *(u8x16 *) (((u8 *) d3) + i * 16) = *(u8x16 *) (((u8 *) s) + i * 16); + } +#else +#error "Either CLIB_HAVE_VEC128 or CLIB_HAVE_VEC64 has to be defined" +#endif +} + +/* + * This function is used when there are no worker threads. + * The main thread performs IO and forwards the packets. + */ +static_always_inline u32 +dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, + vlib_node_runtime_t * node, u32 thread_index, u16 queue_id, + int maybe_multiseg) +{ + u32 n_buffers; + u32 next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; + u32 n_left_to_next, *to_next; + u32 mb_index; + vlib_main_t *vm = vlib_get_main (); + uword n_rx_bytes = 0; + u32 n_trace, trace_cnt __attribute__ ((unused)); + vlib_buffer_free_list_t *fl; + vlib_buffer_t *bt = vec_elt_at_index (dm->buffer_templates, thread_index); + + if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) == 0) + return 0; + + n_buffers = dpdk_rx_burst (dm, xd, queue_id); + + if (n_buffers == 0) + { + return 0; + } + + vec_reset_length (xd->d_trace_buffers[thread_index]); + trace_cnt = n_trace = vlib_get_trace_count (vm, node); + + if (n_trace > 0) + { + u32 n = clib_min (n_trace, n_buffers); + mb_index = 0; + + while (n--) + { + struct rte_mbuf *mb = xd->rx_vectors[queue_id][mb_index++]; + vlib_buffer_t *b = vlib_buffer_from_rte_mbuf (mb); + vec_add1 (xd->d_trace_buffers[thread_index], + vlib_get_buffer_index (vm, b)); + } + } + + fl = vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + + /* Update buffer template */ + vnet_buffer (bt)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; + bt->error = node->errors[DPDK_ERROR_NONE]; + + mb_index = 0; + + while (n_buffers > 0) + { + vlib_buffer_t *b0, *b1, *b2, *b3; + u32 bi0, next0; + u32 bi1, next1; + u32 bi2, next2; + u32 bi3, next3; + u8 error0, error1, error2, error3; + u64 or_ol_flags; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_buffers >= 12 && n_left_to_next >= 4) + { + struct rte_mbuf *mb0, *mb1, *mb2, *mb3; + + /* prefetches are interleaved with the rest of the code to reduce + pressure on L1 cache */ + dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 8]); + dpdk_prefetch_ethertype (xd->rx_vectors[queue_id][mb_index + 4]); + + mb0 = xd->rx_vectors[queue_id][mb_index]; + mb1 = xd->rx_vectors[queue_id][mb_index + 1]; + mb2 = xd->rx_vectors[queue_id][mb_index + 2]; + mb3 = xd->rx_vectors[queue_id][mb_index + 3]; + + ASSERT (mb0); + ASSERT (mb1); + ASSERT (mb2); + ASSERT (mb3); + + if (maybe_multiseg) + { + if (PREDICT_FALSE (mb0->nb_segs > 1)) + dpdk_prefetch_buffer (mb0->next); + if (PREDICT_FALSE (mb1->nb_segs > 1)) + dpdk_prefetch_buffer (mb1->next); + if (PREDICT_FALSE (mb2->nb_segs > 1)) + dpdk_prefetch_buffer (mb2->next); + if (PREDICT_FALSE (mb3->nb_segs > 1)) + dpdk_prefetch_buffer (mb3->next); + } + + b0 = vlib_buffer_from_rte_mbuf (mb0); + b1 = vlib_buffer_from_rte_mbuf (mb1); + b2 = vlib_buffer_from_rte_mbuf (mb2); + b3 = vlib_buffer_from_rte_mbuf (mb3); + + 
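+	  /* Stamp all four buffers with the per-thread metadata template
+	     (one cacheline) using the wide-store helper defined above. */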
dpdk_buffer_init_from_template (b0, b1, b2, b3, bt); + + dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 9]); + dpdk_prefetch_ethertype (xd->rx_vectors[queue_id][mb_index + 5]); + + /* current_data must be set to -RTE_PKTMBUF_HEADROOM in template */ + b0->current_data += mb0->data_off; + b1->current_data += mb1->data_off; + b2->current_data += mb2->data_off; + b3->current_data += mb3->data_off; + + b0->current_length = mb0->data_len; + b1->current_length = mb1->data_len; + b2->current_length = mb2->data_len; + b3->current_length = mb3->data_len; + + dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 10]); + dpdk_prefetch_ethertype (xd->rx_vectors[queue_id][mb_index + 7]); + + bi0 = vlib_get_buffer_index (vm, b0); + bi1 = vlib_get_buffer_index (vm, b1); + bi2 = vlib_get_buffer_index (vm, b2); + bi3 = vlib_get_buffer_index (vm, b3); + + to_next[0] = bi0; + to_next[1] = bi1; + to_next[2] = bi2; + to_next[3] = bi3; + to_next += 4; + n_left_to_next -= 4; + + if (PREDICT_FALSE (xd->per_interface_next_index != ~0)) + { + next0 = next1 = next2 = next3 = xd->per_interface_next_index; + } + else + { + next0 = dpdk_rx_next_from_etype (mb0, b0); + next1 = dpdk_rx_next_from_etype (mb1, b1); + next2 = dpdk_rx_next_from_etype (mb2, b2); + next3 = dpdk_rx_next_from_etype (mb3, b3); + } + + dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 11]); + dpdk_prefetch_ethertype (xd->rx_vectors[queue_id][mb_index + 6]); + + or_ol_flags = (mb0->ol_flags | mb1->ol_flags | + mb2->ol_flags | mb3->ol_flags); + if (PREDICT_FALSE (or_ol_flags & PKT_RX_IP_CKSUM_BAD)) + { + dpdk_rx_error_from_mb (mb0, &next0, &error0); + dpdk_rx_error_from_mb (mb1, &next1, &error1); + dpdk_rx_error_from_mb (mb2, &next2, &error2); + dpdk_rx_error_from_mb (mb3, &next3, &error3); + b0->error = node->errors[error0]; + b1->error = node->errors[error1]; + b2->error = node->errors[error2]; + b3->error = node->errors[error3]; + } + + vlib_buffer_advance (b0, device_input_next_node_advance[next0]); + vlib_buffer_advance (b1, device_input_next_node_advance[next1]); + vlib_buffer_advance (b2, device_input_next_node_advance[next2]); + vlib_buffer_advance (b3, device_input_next_node_advance[next3]); + + n_rx_bytes += mb0->pkt_len; + n_rx_bytes += mb1->pkt_len; + n_rx_bytes += mb2->pkt_len; + n_rx_bytes += mb3->pkt_len; + + /* Process subsequent segments of multi-segment packets */ + if (maybe_multiseg) + { + dpdk_process_subseq_segs (vm, b0, mb0, fl); + dpdk_process_subseq_segs (vm, b1, mb1, fl); + dpdk_process_subseq_segs (vm, b2, mb2, fl); + dpdk_process_subseq_segs (vm, b3, mb3, fl); + } + + /* + * Turn this on if you run into + * "bad monkey" contexts, and you want to know exactly + * which nodes they've visited... See main.c... + */ + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b1); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b2); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b3); + + /* Do we have any driver RX features configured on the interface? 
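+	     If so, this call can rewrite next0..next3 so the packets
+	     traverse the device-input feature arc first.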
	   */
+	  vnet_feature_start_device_input_x4 (xd->vlib_sw_if_index,
+					      &next0, &next1, &next2, &next3,
+					      b0, b1, b2, b3);
+
+	  vlib_validate_buffer_enqueue_x4 (vm, node, next_index,
+					   to_next, n_left_to_next,
+					   bi0, bi1, bi2, bi3,
+					   next0, next1, next2, next3);
+	  n_buffers -= 4;
+	  mb_index += 4;
+	}
+      while (n_buffers > 0 && n_left_to_next > 0)
+	{
+	  struct rte_mbuf *mb0 = xd->rx_vectors[queue_id][mb_index];
+
+	  if (PREDICT_TRUE (n_buffers > 3))
+	    {
+	      dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 2]);
+	      dpdk_prefetch_ethertype (xd->rx_vectors[queue_id]
+				       [mb_index + 1]);
+	    }
+
+	  ASSERT (mb0);
+
+	  b0 = vlib_buffer_from_rte_mbuf (mb0);
+
+	  /* Prefetch one next segment if it exists. */
+	  if (PREDICT_FALSE (mb0->nb_segs > 1))
+	    dpdk_prefetch_buffer (mb0->next);
+
+	  clib_memcpy (b0, bt, CLIB_CACHE_LINE_BYTES);
+
+	  ASSERT (b0->current_data == -RTE_PKTMBUF_HEADROOM);
+	  b0->current_data += mb0->data_off;
+	  b0->current_length = mb0->data_len;
+
+	  bi0 = vlib_get_buffer_index (vm, b0);
+
+	  to_next[0] = bi0;
+	  to_next++;
+	  n_left_to_next--;
+
+	  if (PREDICT_FALSE (xd->per_interface_next_index != ~0))
+	    next0 = xd->per_interface_next_index;
+	  else
+	    next0 = dpdk_rx_next_from_etype (mb0, b0);
+
+	  dpdk_rx_error_from_mb (mb0, &next0, &error0);
+	  b0->error = node->errors[error0];
+
+	  vlib_buffer_advance (b0, device_input_next_node_advance[next0]);
+
+	  n_rx_bytes += mb0->pkt_len;
+
+	  /* Process subsequent segments of multi-segment packets */
+	  dpdk_process_subseq_segs (vm, b0, mb0, fl);
+
+	  /*
+	   * Turn this on if you run into
+	   * "bad monkey" contexts, and you want to know exactly
+	   * which nodes they've visited... See main.c...
+	   */
+	  VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0);
+
+	  /* Do we have any driver RX features configured on the interface? */
+	  vnet_feature_start_device_input_x1 (xd->vlib_sw_if_index, &next0,
+					      b0);
+
+	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+					   to_next, n_left_to_next,
+					   bi0, next0);
+	  n_buffers--;
+	  mb_index++;
+	}
+      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+    }
+
+  if (PREDICT_FALSE (vec_len (xd->d_trace_buffers[thread_index]) > 0))
+    {
+      dpdk_rx_trace (dm, node, xd, queue_id,
+		     xd->d_trace_buffers[thread_index],
+		     vec_len (xd->d_trace_buffers[thread_index]));
+      vlib_set_trace_count (vm, node,
+			    n_trace -
+			    vec_len (xd->d_trace_buffers[thread_index]));
+    }
+
+  vlib_increment_combined_counter
+    (vnet_get_main ()->interface_main.combined_sw_if_counters
+     + VNET_INTERFACE_COUNTER_RX,
+     thread_index, xd->vlib_sw_if_index, mb_index, n_rx_bytes);
+
+  vnet_device_increment_rx_packets (thread_index, mb_index);
+
+  return mb_index;
+}
+
+static inline void
+poll_rate_limit (dpdk_main_t * dm)
+{
+  /* Limit the poll rate by sleeping for N usec between polls */
+  if (PREDICT_FALSE (dm->poll_sleep_usec != 0))
+    {
+      struct timespec ts, tsrem;
+
+      ts.tv_sec = 0;
+      ts.tv_nsec = 1000 * dm->poll_sleep_usec;
+
+      while (nanosleep (&ts, &tsrem) < 0)
+	{
+	  ts = tsrem;
+	}
+    }
+}
+
+/** \brief Main DPDK input node
+    @node dpdk-input
+
+    This is the main DPDK input node: across each assigned interface,
+    call rte_eth_rx_burst(...) or similar to obtain a vector of
+    packets to process. Handle early packet discard. Derive @c
+    vlib_buffer_t metadata from <code>struct rte_mbuf</code> metadata.
+    Depending on the resulting metadata, adjust <code>b->current_data,
+    b->current_length </code> and dispatch directly to
+    ip4-input-no-checksum, or ip6-input. Trace the packet if required.
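+
+    As a sketch of the common direct-dispatch case: an IPv4 packet whose
+    hardware checksum verified yields <code>next0 =
+    VNET_DEVICE_INPUT_NEXT_IP4_NCS_INPUT</code>, and the buffer is
+    advanced past the Ethernet header before being enqueued.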
+ + @param vm vlib_main_t corresponding to the current thread + @param node vlib_node_runtime_t + @param f vlib_frame_t input-node, not used. + + @par Graph mechanics: buffer metadata, next index usage + + @em Uses: + - <code>struct rte_mbuf mb->ol_flags</code> + - PKT_RX_IP_CKSUM_BAD + - <code> RTE_ETH_IS_xxx_HDR(mb->packet_type) </code> + - packet classification result + + @em Sets: + - <code>b->error</code> if the packet is to be dropped immediately + - <code>b->current_data, b->current_length</code> + - adjusted as needed to skip the L2 header in direct-dispatch cases + - <code>vnet_buffer(b)->sw_if_index[VLIB_RX]</code> + - rx interface sw_if_index + - <code>vnet_buffer(b)->sw_if_index[VLIB_TX] = ~0</code> + - required by ipX-lookup + - <code>b->flags</code> + - to indicate multi-segment pkts (VLIB_BUFFER_NEXT_PRESENT), etc. + + <em>Next Nodes:</em> + - Static arcs to: error-drop, ethernet-input, + ip4-input-no-checksum, ip6-input, mpls-input + - per-interface redirection, controlled by + <code>xd->per_interface_next_index</code> +*/ + +static uword +dpdk_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * f) +{ + dpdk_main_t *dm = &dpdk_main; + dpdk_device_t *xd; + uword n_rx_packets = 0; + vnet_device_input_runtime_t *rt = (void *) node->runtime_data; + vnet_device_and_queue_t *dq; + u32 thread_index = node->thread_index; + + /* + * Poll all devices on this cpu for input/interrupts. + */ + /* *INDENT-OFF* */ + foreach_device_and_queue (dq, rt->devices_and_queues) + { + xd = vec_elt_at_index(dm->devices, dq->dev_instance); + if (PREDICT_FALSE (xd->flags & DPDK_DEVICE_FLAG_BOND_SLAVE)) + continue; /* Do not poll slave to a bonded interface */ + if (xd->flags & DPDK_DEVICE_FLAG_MAYBE_MULTISEG) + n_rx_packets += dpdk_device_input (dm, xd, node, thread_index, dq->queue_id, /* maybe_multiseg */ 1); + else + n_rx_packets += dpdk_device_input (dm, xd, node, thread_index, dq->queue_id, /* maybe_multiseg */ 0); + } + /* *INDENT-ON* */ + + poll_rate_limit (dm); + + return n_rx_packets; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (dpdk_input_node) = { + .function = dpdk_input, + .type = VLIB_NODE_TYPE_INPUT, + .name = "dpdk-input", + .sibling_of = "device-input", + + /* Will be enabled if/when hardware is detected. */ + .state = VLIB_NODE_STATE_DISABLED, + + .format_buffer = format_ethernet_header_with_length, + .format_trace = format_dpdk_rx_dma_trace, + + .n_errors = DPDK_N_ERROR, + .error_strings = dpdk_error_strings, +}; + +VLIB_NODE_FUNCTION_MULTIARCH (dpdk_input_node, dpdk_input); +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/dpdk/hqos/hqos.c b/src/plugins/dpdk/hqos/hqos.c new file mode 100644 index 00000000..c9b85652 --- /dev/null +++ b/src/plugins/dpdk/hqos/hqos.c @@ -0,0 +1,772 @@ +/* + * Copyright(c) 2016 Intel Corporation. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/stat.h> +#include <sys/mount.h> +#include <string.h> +#include <fcntl.h> + +#include <vppinfra/vec.h> +#include <vppinfra/error.h> +#include <vppinfra/format.h> +#include <vppinfra/bitmap.h> + +#include <vnet/vnet.h> +#include <vnet/ethernet/ethernet.h> +#include <dpdk/device/dpdk.h> + +#include <vlib/pci/pci.h> +#include <vlibmemory/api.h> +#include <vlibmemory/vl_memory_msg_enum.h> /* enumerate all vlib messages */ + +#define vl_typedefs /* define message structures */ +#include <vlibmemory/vl_memory_api_h.h> +#undef vl_typedefs + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__) +#define vl_printfun +#include <vlibmemory/vl_memory_api_h.h> +#undef vl_printfun + +#include <dpdk/device/dpdk_priv.h> + +/*** + * + * HQoS default configuration values + * + ***/ + +static dpdk_device_config_hqos_t hqos_params_default = { + .hqos_thread_valid = 0, + + .swq_size = 4096, + .burst_enq = 256, + .burst_deq = 220, + + /* + * Packet field to identify the subport. + * + * Default value: Since only one subport is defined by default (see below: + * n_subports_per_port = 1), the subport ID is hardcoded to 0. + */ + .pktfield0_slabpos = 0, + .pktfield0_slabmask = 0, + + /* + * Packet field to identify the pipe. + * + * Default value: Assuming Ethernet/IPv4/UDP packets, UDP payload bits 12 .. 23 + */ + .pktfield1_slabpos = 40, + .pktfield1_slabmask = 0x0000000FFF000000LLU, + + /* Packet field used as index into TC translation table to identify the traffic + * class and queue. + * + * Default value: Assuming Ethernet/IPv4 packets, IPv4 DSCP field + */ + .pktfield2_slabpos = 8, + .pktfield2_slabmask = 0x00000000000000FCLLU, + .tc_table = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + }, + + /* port */ + .port = { + .name = NULL, /* Set at init */ + .socket = 0, /* Set at init */ + .rate = 1250000000, /* Assuming 10GbE port */ + .mtu = 14 + 1500, /* Assuming Ethernet/IPv4 pkt (Ethernet FCS not included) */ + .frame_overhead = RTE_SCHED_FRAME_OVERHEAD_DEFAULT, + .n_subports_per_port = 1, + .n_pipes_per_subport = 4096, + .qsize = {64, 64, 64, 64}, + .pipe_profiles = NULL, /* Set at config */ + .n_pipe_profiles = 1, + +#ifdef RTE_SCHED_RED + .red_params = { + /* Traffic Class 0 Colors Green / Yellow / Red */ + [0][0] = {.min_th = 48,.max_th = 64,.maxp_inv = + 10,.wq_log2 = 9}, + [0][1] = {.min_th = 40,.max_th = 64,.maxp_inv = + 10,.wq_log2 = 9}, + [0][2] = {.min_th = 32,.max_th = 64,.maxp_inv = + 10,.wq_log2 = 9}, + + /* Traffic Class 1 - Colors Green / Yellow / Red */ + [1][0] = {.min_th = 48,.max_th = 64,.maxp_inv = + 10,.wq_log2 = 9}, + [1][1] = {.min_th = 40,.max_th = 64,.maxp_inv = + 10,.wq_log2 = 9}, + [1][2] = {.min_th = 32,.max_th = 64,.maxp_inv = + 10,.wq_log2 = 9}, + + /* Traffic Class 2 - Colors Green / Yellow / Red */ + [2][0] = {.min_th = 48,.max_th = 64,.maxp_inv = + 10,.wq_log2 = 9}, + [2][1] = {.min_th = 40,.max_th = 64,.maxp_inv = + 10,.wq_log2 = 9}, + [2][2] = {.min_th = 32,.max_th = 64,.maxp_inv = + 10,.wq_log2 = 9}, + + /* Traffic Class 3 - Colors Green / Yellow / Red */ + [3][0] = {.min_th = 48,.max_th = 64,.maxp_inv = + 10,.wq_log2 = 9}, + [3][1] = {.min_th = 40,.max_th = 64,.maxp_inv = + 10,.wq_log2 = 9}, + [3][2] = {.min_th = 32,.max_th = 64,.maxp_inv = + 
10,.wq_log2 = 9} + }, +#endif /* RTE_SCHED_RED */ + }, +}; + +static struct rte_sched_subport_params hqos_subport_params_default = { + .tb_rate = 1250000000, /* 10GbE line rate (measured in bytes/second) */ + .tb_size = 1000000, + .tc_rate = {1250000000, 1250000000, 1250000000, 1250000000}, + .tc_period = 10, +}; + +static struct rte_sched_pipe_params hqos_pipe_params_default = { + .tb_rate = 305175, /* 10GbE line rate divided by 4K pipes */ + .tb_size = 1000000, + .tc_rate = {305175, 305175, 305175, 305175}, + .tc_period = 40, +#ifdef RTE_SCHED_SUBPORT_TC_OV + .tc_ov_weight = 1, +#endif + .wrr_weights = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, +}; + +/*** + * + * HQoS configuration + * + ***/ + +int +dpdk_hqos_validate_mask (u64 mask, u32 n) +{ + int count = __builtin_popcountll (mask); + int pos_lead = sizeof (u64) * 8 - __builtin_clzll (mask); + int pos_trail = __builtin_ctzll (mask); + int count_expected = __builtin_popcount (n - 1); + + /* Handle the exceptions */ + if (n == 0) + return -1; /* Error */ + + if ((mask == 0) && (n == 1)) + return 0; /* OK */ + + if (((mask == 0) && (n != 1)) || ((mask != 0) && (n == 1))) + return -2; /* Error */ + + /* Check that mask is contiguous */ + if ((pos_lead - pos_trail) != count) + return -3; /* Error */ + + /* Check that mask contains the expected number of bits set */ + if (count != count_expected) + return -4; /* Error */ + + return 0; /* OK */ +} + +void +dpdk_device_config_hqos_pipe_profile_default (dpdk_device_config_hqos_t * + hqos, u32 pipe_profile_id) +{ + memcpy (&hqos->pipe[pipe_profile_id], &hqos_pipe_params_default, + sizeof (hqos_pipe_params_default)); +} + +void +dpdk_device_config_hqos_default (dpdk_device_config_hqos_t * hqos) +{ + struct rte_sched_subport_params *subport_params; + struct rte_sched_pipe_params *pipe_params; + u32 *pipe_map; + u32 i; + + memcpy (hqos, &hqos_params_default, sizeof (hqos_params_default)); + + /* pipe */ + vec_add2 (hqos->pipe, pipe_params, hqos->port.n_pipe_profiles); + + for (i = 0; i < vec_len (hqos->pipe); i++) + memcpy (&pipe_params[i], + &hqos_pipe_params_default, sizeof (hqos_pipe_params_default)); + + hqos->port.pipe_profiles = hqos->pipe; + + /* subport */ + vec_add2 (hqos->subport, subport_params, hqos->port.n_subports_per_port); + + for (i = 0; i < vec_len (hqos->subport); i++) + memcpy (&subport_params[i], + &hqos_subport_params_default, + sizeof (hqos_subport_params_default)); + + /* pipe profile */ + vec_add2 (hqos->pipe_map, + pipe_map, + hqos->port.n_subports_per_port * hqos->port.n_pipes_per_subport); + + for (i = 0; i < vec_len (hqos->pipe_map); i++) + pipe_map[i] = 0; +} + +/*** + * + * HQoS init + * + ***/ + +clib_error_t * +dpdk_port_setup_hqos (dpdk_device_t * xd, dpdk_device_config_hqos_t * hqos) +{ + vlib_thread_main_t *tm = vlib_get_thread_main (); + char name[32]; + u32 subport_id, i; + int rv; + + /* Detect the set of worker threads */ + int worker_thread_first = 0; + int worker_thread_count = 0; + + uword *p = hash_get_mem (tm->thread_registrations_by_name, "workers"); + vlib_thread_registration_t *tr = + p ? 
(vlib_thread_registration_t *) p[0] : 0; + + if (tr && tr->count > 0) + { + worker_thread_first = tr->first_index; + worker_thread_count = tr->count; + } + + /* Allocate the per-thread device data array */ + vec_validate_aligned (xd->hqos_wt, tm->n_vlib_mains - 1, + CLIB_CACHE_LINE_BYTES); + memset (xd->hqos_wt, 0, tm->n_vlib_mains * sizeof (xd->hqos_wt[0])); + + vec_validate_aligned (xd->hqos_ht, 0, CLIB_CACHE_LINE_BYTES); + memset (xd->hqos_ht, 0, sizeof (xd->hqos_ht[0])); + + /* Allocate space for one SWQ per worker thread in the I/O TX thread data structure */ + vec_validate (xd->hqos_ht->swq, worker_thread_count); + + /* SWQ */ + for (i = 0; i < worker_thread_count + 1; i++) + { + u32 swq_flags = RING_F_SP_ENQ | RING_F_SC_DEQ; + + snprintf (name, sizeof (name), "SWQ-worker%u-to-device%u", i, + xd->device_index); + xd->hqos_ht->swq[i] = + rte_ring_create (name, hqos->swq_size, xd->cpu_socket, swq_flags); + if (xd->hqos_ht->swq[i] == NULL) + return clib_error_return (0, + "SWQ-worker%u-to-device%u: rte_ring_create err", + i, xd->device_index); + } + + /* + * HQoS + */ + + /* HQoS port */ + snprintf (name, sizeof (name), "HQoS%u", xd->device_index); + hqos->port.name = strdup (name); + if (hqos->port.name == NULL) + return clib_error_return (0, "HQoS%u: strdup err", xd->device_index); + + hqos->port.socket = rte_eth_dev_socket_id (xd->device_index); + if (hqos->port.socket == SOCKET_ID_ANY) + hqos->port.socket = 0; + + xd->hqos_ht->hqos = rte_sched_port_config (&hqos->port); + if (xd->hqos_ht->hqos == NULL) + return clib_error_return (0, "HQoS%u: rte_sched_port_config err", + xd->device_index); + + /* HQoS subport */ + for (subport_id = 0; subport_id < hqos->port.n_subports_per_port; + subport_id++) + { + u32 pipe_id; + + rv = + rte_sched_subport_config (xd->hqos_ht->hqos, subport_id, + &hqos->subport[subport_id]); + if (rv) + return clib_error_return (0, + "HQoS%u subport %u: rte_sched_subport_config err (%d)", + xd->device_index, subport_id, rv); + + /* HQoS pipe */ + for (pipe_id = 0; pipe_id < hqos->port.n_pipes_per_subport; pipe_id++) + { + u32 pos = subport_id * hqos->port.n_pipes_per_subport + pipe_id; + u32 profile_id = hqos->pipe_map[pos]; + + rv = + rte_sched_pipe_config (xd->hqos_ht->hqos, subport_id, pipe_id, + profile_id); + if (rv) + return clib_error_return (0, + "HQoS%u subport %u pipe %u: rte_sched_pipe_config err (%d)", + xd->device_index, subport_id, pipe_id, + rv); + } + } + + /* Set up per-thread device data for the I/O TX thread */ + xd->hqos_ht->hqos_burst_enq = hqos->burst_enq; + xd->hqos_ht->hqos_burst_deq = hqos->burst_deq; + vec_validate (xd->hqos_ht->pkts_enq, 2 * hqos->burst_enq - 1); + vec_validate (xd->hqos_ht->pkts_deq, hqos->burst_deq - 1); + xd->hqos_ht->pkts_enq_len = 0; + xd->hqos_ht->swq_pos = 0; + xd->hqos_ht->flush_count = 0; + + /* Set up per-thread device data for each worker thread */ + for (i = 0; i < worker_thread_count + 1; i++) + { + u32 tid; + if (i) + tid = worker_thread_first + (i - 1); + else + tid = i; + + xd->hqos_wt[tid].swq = xd->hqos_ht->swq[i]; + xd->hqos_wt[tid].hqos_field0_slabpos = hqos->pktfield0_slabpos; + xd->hqos_wt[tid].hqos_field0_slabmask = hqos->pktfield0_slabmask; + xd->hqos_wt[tid].hqos_field0_slabshr = + __builtin_ctzll (hqos->pktfield0_slabmask); + xd->hqos_wt[tid].hqos_field1_slabpos = hqos->pktfield1_slabpos; + xd->hqos_wt[tid].hqos_field1_slabmask = hqos->pktfield1_slabmask; + xd->hqos_wt[tid].hqos_field1_slabshr = + __builtin_ctzll (hqos->pktfield1_slabmask); + xd->hqos_wt[tid].hqos_field2_slabpos = 
hqos->pktfield2_slabpos; + xd->hqos_wt[tid].hqos_field2_slabmask = hqos->pktfield2_slabmask; + xd->hqos_wt[tid].hqos_field2_slabshr = + __builtin_ctzll (hqos->pktfield2_slabmask); + memcpy (xd->hqos_wt[tid].hqos_tc_table, hqos->tc_table, + sizeof (hqos->tc_table)); + } + + return 0; +} + +/*** + * + * HQoS run-time + * + ***/ +/* + * dpdk_hqos_thread - Contains the main loop of an HQoS thread. + * + * w + * Information for the current thread + */ +static_always_inline void +dpdk_hqos_thread_internal_hqos_dbg_bypass (vlib_main_t * vm) +{ + dpdk_main_t *dm = &dpdk_main; + u32 thread_index = vm->thread_index; + u32 dev_pos; + + dev_pos = 0; + while (1) + { + vlib_worker_thread_barrier_check (); + + u32 n_devs = vec_len (dm->devices_by_hqos_cpu[thread_index]); + if (dev_pos >= n_devs) + dev_pos = 0; + + dpdk_device_and_queue_t *dq = + vec_elt_at_index (dm->devices_by_hqos_cpu[thread_index], dev_pos); + dpdk_device_t *xd = vec_elt_at_index (dm->devices, dq->device); + + dpdk_device_hqos_per_hqos_thread_t *hqos = xd->hqos_ht; + u32 device_index = xd->device_index; + u16 queue_id = dq->queue_id; + + struct rte_mbuf **pkts_enq = hqos->pkts_enq; + u32 pkts_enq_len = hqos->pkts_enq_len; + u32 swq_pos = hqos->swq_pos; + u32 n_swq = vec_len (hqos->swq), i; + u32 flush_count = hqos->flush_count; + + for (i = 0; i < n_swq; i++) + { + /* Get current SWQ for this device */ + struct rte_ring *swq = hqos->swq[swq_pos]; + + /* Read SWQ burst to packet buffer of this device */ + pkts_enq_len += rte_ring_sc_dequeue_burst (swq, + (void **) + &pkts_enq[pkts_enq_len], + hqos->hqos_burst_enq, 0); + + /* Get next SWQ for this device */ + swq_pos++; + if (swq_pos >= n_swq) + swq_pos = 0; + hqos->swq_pos = swq_pos; + + /* HWQ TX enqueue when burst available */ + if (pkts_enq_len >= hqos->hqos_burst_enq) + { + u32 n_pkts = rte_eth_tx_burst (device_index, + (uint16_t) queue_id, + pkts_enq, + (uint16_t) pkts_enq_len); + + for (; n_pkts < pkts_enq_len; n_pkts++) + rte_pktmbuf_free (pkts_enq[n_pkts]); + + pkts_enq_len = 0; + flush_count = 0; + break; + } + } + if (pkts_enq_len) + { + flush_count++; + if (PREDICT_FALSE (flush_count == HQOS_FLUSH_COUNT_THRESHOLD)) + { + rte_sched_port_enqueue (hqos->hqos, pkts_enq, pkts_enq_len); + + pkts_enq_len = 0; + flush_count = 0; + } + } + hqos->pkts_enq_len = pkts_enq_len; + hqos->flush_count = flush_count; + + /* Advance to next device */ + dev_pos++; + } +} + +static_always_inline void +dpdk_hqos_thread_internal (vlib_main_t * vm) +{ + dpdk_main_t *dm = &dpdk_main; + u32 thread_index = vm->thread_index; + u32 dev_pos; + + dev_pos = 0; + while (1) + { + vlib_worker_thread_barrier_check (); + + u32 n_devs = vec_len (dm->devices_by_hqos_cpu[thread_index]); + if (PREDICT_FALSE (n_devs == 0)) + { + dev_pos = 0; + continue; + } + if (dev_pos >= n_devs) + dev_pos = 0; + + dpdk_device_and_queue_t *dq = + vec_elt_at_index (dm->devices_by_hqos_cpu[thread_index], dev_pos); + dpdk_device_t *xd = vec_elt_at_index (dm->devices, dq->device); + + dpdk_device_hqos_per_hqos_thread_t *hqos = xd->hqos_ht; + u32 device_index = xd->device_index; + u16 queue_id = dq->queue_id; + + struct rte_mbuf **pkts_enq = hqos->pkts_enq; + struct rte_mbuf **pkts_deq = hqos->pkts_deq; + u32 pkts_enq_len = hqos->pkts_enq_len; + u32 swq_pos = hqos->swq_pos; + u32 n_swq = vec_len (hqos->swq), i; + u32 flush_count = hqos->flush_count; + + /* + * SWQ dequeue and HQoS enqueue for current device + */ + for (i = 0; i < n_swq; i++) + { + /* Get current SWQ for this device */ + struct rte_ring *swq = hqos->swq[swq_pos]; + + /* 
Read SWQ burst to packet buffer of this device */ + pkts_enq_len += rte_ring_sc_dequeue_burst (swq, + (void **) + &pkts_enq[pkts_enq_len], + hqos->hqos_burst_enq, 0); + + /* Get next SWQ for this device */ + swq_pos++; + if (swq_pos >= n_swq) + swq_pos = 0; + hqos->swq_pos = swq_pos; + + /* HQoS enqueue when burst available */ + if (pkts_enq_len >= hqos->hqos_burst_enq) + { + rte_sched_port_enqueue (hqos->hqos, pkts_enq, pkts_enq_len); + + pkts_enq_len = 0; + flush_count = 0; + break; + } + } + if (pkts_enq_len) + { + flush_count++; + if (PREDICT_FALSE (flush_count == HQOS_FLUSH_COUNT_THRESHOLD)) + { + rte_sched_port_enqueue (hqos->hqos, pkts_enq, pkts_enq_len); + + pkts_enq_len = 0; + flush_count = 0; + } + } + hqos->pkts_enq_len = pkts_enq_len; + hqos->flush_count = flush_count; + + /* + * HQoS dequeue and HWQ TX enqueue for current device + */ + { + u32 pkts_deq_len, n_pkts; + + pkts_deq_len = rte_sched_port_dequeue (hqos->hqos, + pkts_deq, + hqos->hqos_burst_deq); + + for (n_pkts = 0; n_pkts < pkts_deq_len;) + n_pkts += rte_eth_tx_burst (device_index, + (uint16_t) queue_id, + &pkts_deq[n_pkts], + (uint16_t) (pkts_deq_len - n_pkts)); + } + + /* Advance to next device */ + dev_pos++; + } +} + +void +dpdk_hqos_thread (vlib_worker_thread_t * w) +{ + vlib_main_t *vm; + vlib_thread_main_t *tm = vlib_get_thread_main (); + dpdk_main_t *dm = &dpdk_main; + + vm = vlib_get_main (); + + ASSERT (vm->thread_index == vlib_get_thread_index ()); + + clib_time_init (&vm->clib_time); + clib_mem_set_heap (w->thread_mheap); + + /* Wait until the dpdk init sequence is complete */ + while (tm->worker_thread_release == 0) + vlib_worker_thread_barrier_check (); + + if (vec_len (dm->devices_by_hqos_cpu[vm->thread_index]) == 0) + return + clib_error + ("current I/O TX thread does not have any devices assigned to it"); + + if (DPDK_HQOS_DBG_BYPASS) + dpdk_hqos_thread_internal_hqos_dbg_bypass (vm); + else + dpdk_hqos_thread_internal (vm); +} + +void +dpdk_hqos_thread_fn (void *arg) +{ + vlib_worker_thread_t *w = (vlib_worker_thread_t *) arg; + vlib_worker_thread_init (w); + dpdk_hqos_thread (w); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_THREAD (hqos_thread_reg, static) = +{ + .name = "hqos-threads", + .short_name = "hqos-threads", + .function = dpdk_hqos_thread_fn, +}; +/* *INDENT-ON* */ + +/* + * HQoS run-time code to be called by the worker threads + */ +#define BITFIELD(byte_array, slab_pos, slab_mask, slab_shr) \ +({ \ + u64 slab = *((u64 *) &byte_array[slab_pos]); \ + u64 val = (rte_be_to_cpu_64(slab) & slab_mask) >> slab_shr; \ + val; \ +}) + +#define RTE_SCHED_PORT_HIERARCHY(subport, pipe, traffic_class, queue, color) \ + ((((u64) (queue)) & 0x3) | \ + ((((u64) (traffic_class)) & 0x3) << 2) | \ + ((((u64) (color)) & 0x3) << 4) | \ + ((((u64) (subport)) & 0xFFFF) << 16) | \ + ((((u64) (pipe)) & 0xFFFFFFFF) << 32)) + +void +dpdk_hqos_metadata_set (dpdk_device_hqos_per_worker_thread_t * hqos, + struct rte_mbuf **pkts, u32 n_pkts) +{ + u32 i; + + for (i = 0; i < (n_pkts & (~0x3)); i += 4) + { + struct rte_mbuf *pkt0 = pkts[i]; + struct rte_mbuf *pkt1 = pkts[i + 1]; + struct rte_mbuf *pkt2 = pkts[i + 2]; + struct rte_mbuf *pkt3 = pkts[i + 3]; + + u8 *pkt0_data = rte_pktmbuf_mtod (pkt0, u8 *); + u8 *pkt1_data = rte_pktmbuf_mtod (pkt1, u8 *); + u8 *pkt2_data = rte_pktmbuf_mtod (pkt2, u8 *); + u8 *pkt3_data = rte_pktmbuf_mtod (pkt3, u8 *); + + u64 pkt0_subport = BITFIELD (pkt0_data, hqos->hqos_field0_slabpos, + hqos->hqos_field0_slabmask, + hqos->hqos_field0_slabshr); + u64 pkt0_pipe = BITFIELD (pkt0_data, 
hqos->hqos_field1_slabpos, + hqos->hqos_field1_slabmask, + hqos->hqos_field1_slabshr); + u64 pkt0_dscp = BITFIELD (pkt0_data, hqos->hqos_field2_slabpos, + hqos->hqos_field2_slabmask, + hqos->hqos_field2_slabshr); + u32 pkt0_tc = hqos->hqos_tc_table[pkt0_dscp & 0x3F] >> 2; + u32 pkt0_tc_q = hqos->hqos_tc_table[pkt0_dscp & 0x3F] & 0x3; + + u64 pkt1_subport = BITFIELD (pkt1_data, hqos->hqos_field0_slabpos, + hqos->hqos_field0_slabmask, + hqos->hqos_field0_slabshr); + u64 pkt1_pipe = BITFIELD (pkt1_data, hqos->hqos_field1_slabpos, + hqos->hqos_field1_slabmask, + hqos->hqos_field1_slabshr); + u64 pkt1_dscp = BITFIELD (pkt1_data, hqos->hqos_field2_slabpos, + hqos->hqos_field2_slabmask, + hqos->hqos_field2_slabshr); + u32 pkt1_tc = hqos->hqos_tc_table[pkt1_dscp & 0x3F] >> 2; + u32 pkt1_tc_q = hqos->hqos_tc_table[pkt1_dscp & 0x3F] & 0x3; + + u64 pkt2_subport = BITFIELD (pkt2_data, hqos->hqos_field0_slabpos, + hqos->hqos_field0_slabmask, + hqos->hqos_field0_slabshr); + u64 pkt2_pipe = BITFIELD (pkt2_data, hqos->hqos_field1_slabpos, + hqos->hqos_field1_slabmask, + hqos->hqos_field1_slabshr); + u64 pkt2_dscp = BITFIELD (pkt2_data, hqos->hqos_field2_slabpos, + hqos->hqos_field2_slabmask, + hqos->hqos_field2_slabshr); + u32 pkt2_tc = hqos->hqos_tc_table[pkt2_dscp & 0x3F] >> 2; + u32 pkt2_tc_q = hqos->hqos_tc_table[pkt2_dscp & 0x3F] & 0x3; + + u64 pkt3_subport = BITFIELD (pkt3_data, hqos->hqos_field0_slabpos, + hqos->hqos_field0_slabmask, + hqos->hqos_field0_slabshr); + u64 pkt3_pipe = BITFIELD (pkt3_data, hqos->hqos_field1_slabpos, + hqos->hqos_field1_slabmask, + hqos->hqos_field1_slabshr); + u64 pkt3_dscp = BITFIELD (pkt3_data, hqos->hqos_field2_slabpos, + hqos->hqos_field2_slabmask, + hqos->hqos_field2_slabshr); + u32 pkt3_tc = hqos->hqos_tc_table[pkt3_dscp & 0x3F] >> 2; + u32 pkt3_tc_q = hqos->hqos_tc_table[pkt3_dscp & 0x3F] & 0x3; + + u64 pkt0_sched = RTE_SCHED_PORT_HIERARCHY (pkt0_subport, + pkt0_pipe, + pkt0_tc, + pkt0_tc_q, + 0); + u64 pkt1_sched = RTE_SCHED_PORT_HIERARCHY (pkt1_subport, + pkt1_pipe, + pkt1_tc, + pkt1_tc_q, + 0); + u64 pkt2_sched = RTE_SCHED_PORT_HIERARCHY (pkt2_subport, + pkt2_pipe, + pkt2_tc, + pkt2_tc_q, + 0); + u64 pkt3_sched = RTE_SCHED_PORT_HIERARCHY (pkt3_subport, + pkt3_pipe, + pkt3_tc, + pkt3_tc_q, + 0); + + pkt0->hash.sched.lo = pkt0_sched & 0xFFFFFFFF; + pkt0->hash.sched.hi = pkt0_sched >> 32; + pkt1->hash.sched.lo = pkt1_sched & 0xFFFFFFFF; + pkt1->hash.sched.hi = pkt1_sched >> 32; + pkt2->hash.sched.lo = pkt2_sched & 0xFFFFFFFF; + pkt2->hash.sched.hi = pkt2_sched >> 32; + pkt3->hash.sched.lo = pkt3_sched & 0xFFFFFFFF; + pkt3->hash.sched.hi = pkt3_sched >> 32; + } + + for (; i < n_pkts; i++) + { + struct rte_mbuf *pkt = pkts[i]; + + u8 *pkt_data = rte_pktmbuf_mtod (pkt, u8 *); + + u64 pkt_subport = BITFIELD (pkt_data, hqos->hqos_field0_slabpos, + hqos->hqos_field0_slabmask, + hqos->hqos_field0_slabshr); + u64 pkt_pipe = BITFIELD (pkt_data, hqos->hqos_field1_slabpos, + hqos->hqos_field1_slabmask, + hqos->hqos_field1_slabshr); + u64 pkt_dscp = BITFIELD (pkt_data, hqos->hqos_field2_slabpos, + hqos->hqos_field2_slabmask, + hqos->hqos_field2_slabshr); + u32 pkt_tc = hqos->hqos_tc_table[pkt_dscp & 0x3F] >> 2; + u32 pkt_tc_q = hqos->hqos_tc_table[pkt_dscp & 0x3F] & 0x3; + + u64 pkt_sched = RTE_SCHED_PORT_HIERARCHY (pkt_subport, + pkt_pipe, + pkt_tc, + pkt_tc_q, + 0); + + pkt->hash.sched.lo = pkt_sched & 0xFFFFFFFF; + pkt->hash.sched.hi = pkt_sched >> 32; + } +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * 
End:
 */
diff --git a/src/plugins/dpdk/hqos/qos_doc.md b/src/plugins/dpdk/hqos/qos_doc.md
new file mode 100644
index 00000000..7c064246
--- /dev/null
+++ b/src/plugins/dpdk/hqos/qos_doc.md
@@ -0,0 +1,411 @@
+# QoS Hierarchical Scheduler {#qos_doc}
+
+The Quality-of-Service (QoS) scheduler performs egress-traffic management by
+prioritizing the transmission of packets of different service types and
+subscribers based on the Service Level Agreements (SLAs). The QoS scheduler
+can be enabled on one or more NIC output interfaces, depending on the
+requirement.
+
+
+## Overview
+
+The QoS scheduler supports a number of scheduling and shaping levels which
+together construct a hierarchical tree. The first level in the hierarchy is
+the port (i.e. the physical interface), which constitutes the root node of
+the tree. The next level is the subport, which represents a group of
+users/subscribers. The individual user/subscriber is represented by the pipe
+at the next level. Each user can carry different traffic types with specific
+loss-rate, jitter, and latency requirements; these are represented at the
+next level in the form of different traffic classes. The last level contains
+a number of queues, which are grouped together to hold the packets of a
+specific traffic class.
+
+The QoS scheduler implementation requires flow classification, enqueue and
+dequeue operations. The flow classification is a mandatory stage for HQoS,
+where incoming packets are classified by mapping packet field information to
+a 5-tuple (HQoS subport, pipe, traffic class, queue within traffic class,
+and color) and storing that information in the mbuf sched field. The enqueue
+operation uses this information to determine the queue for storing the
+packet; at this stage, if the specific queue is full, QoS drops the packet.
+The dequeue operation consists of scheduling the packet based on its length
+and available credits, and handing the scheduled packet over to the output
+interface.
+
+For more information on the QoS scheduler, please refer to the DPDK
+Programmer's Guide:
+http://dpdk.org/doc/guides/prog_guide/qos_framework.html
+
+
+### QoS Scheduler Parameters
+
+The following illustrates the default HQoS configuration for each 10GbE
+output port:
+
+Single subport (subport 0):
+  - Subport rate set to 100% of port rate
+  - Each of the 4 traffic classes has rate set to 100% of port rate
+
+4K pipes per subport 0 (pipes 0 .. 4095) with identical configuration:
+  - Pipe rate set to 1/4K of port rate
+  - Each of the 4 traffic classes has rate set to 100% of pipe rate
+  - Within each traffic class, the byte-level WRR weights for the 4 queues are set to 1:1:1:1
+
+
+#### Port configuration
+
+```
+port {
+  rate 1250000000           /* Assuming 10GbE port */
+  frame_overhead 24         /* Overhead fields per Ethernet frame:
+                             * 7B (Preamble) +
+                             * 1B (Start of Frame Delimiter (SFD)) +
+                             * 4B (Frame Check Sequence (FCS)) +
+                             * 12B (Inter Frame Gap (IFG))
+                             */
+  mtu 1522                  /* Assuming Ethernet/IPv4 pkt (FCS not included) */
+  n_subports_per_port 1     /* Number of subports per output interface */
+  n_pipes_per_subport 4096  /* Number of pipes (users/subscribers) */
+  queue_sizes 64 64 64 64   /* Packet queue size for each traffic class.
+                             * All queues within the same pipe traffic class
+                             * have the same size. Queues from different
+                             * pipes serving the same traffic class have
+                             * the same size.
*/ +} +``` + + +#### Subport configuration + +``` +subport 0 { + tb_rate 1250000000 /* Subport level token bucket rate (bytes per second) */ + tb_size 1000000 /* Subport level token bucket size (bytes) */ + tc0_rate 1250000000 /* Subport level token bucket rate for traffic class 0 (bytes per second) */ + tc1_rate 1250000000 /* Subport level token bucket rate for traffic class 1 (bytes per second) */ + tc2_rate 1250000000 /* Subport level token bucket rate for traffic class 2 (bytes per second) */ + tc3_rate 1250000000 /* Subport level token bucket rate for traffic class 3 (bytes per second) */ + tc_period 10 /* Time interval for refilling the token bucket associated with traffic class (Milliseconds) */ + pipe 0 4095 profile 0 /* pipes (users/subscribers) configured with pipe profile 0 */ +} +``` + + +#### Pipe configuration + +``` +pipe_profile 0 { + tb_rate 305175 /* Pipe level token bucket rate (bytes per second) */ + tb_size 1000000 /* Pipe level token bucket size (bytes) */ + tc0_rate 305175 /* Pipe level token bucket rate for traffic class 0 (bytes per second) */ + tc1_rate 305175 /* Pipe level token bucket rate for traffic class 1 (bytes per second) */ + tc2_rate 305175 /* Pipe level token bucket rate for traffic class 2 (bytes per second) */ + tc3_rate 305175 /* Pipe level token bucket rate for traffic class 3 (bytes per second) */ + tc_period 40 /* Time interval for refilling the token bucket associated with traffic class at pipe level (Milliseconds) */ + tc3_oversubscription_weight 1 /* Weight traffic class 3 oversubscription */ + tc0_wrr_weights 1 1 1 1 /* Pipe queues WRR weights for traffic class 0 */ + tc1_wrr_weights 1 1 1 1 /* Pipe queues WRR weights for traffic class 1 */ + tc2_wrr_weights 1 1 1 1 /* Pipe queues WRR weights for traffic class 2 */ + tc3_wrr_weights 1 1 1 1 /* Pipe queues WRR weights for traffic class 3 */ +} +``` + + +#### Random Early Detection (RED) parameters per traffic class and color (Green / Yellow / Red) + +``` +red { + tc0_wred_min 48 40 32 /* Minimum threshold for traffic class 0 queue (min_th) in number of packets */ + tc0_wred_max 64 64 64 /* Maximum threshold for traffic class 0 queue (max_th) in number of packets */ + tc0_wred_inv_prob 10 10 10 /* Inverse of packet marking probability for traffic class 0 queue (maxp = 1 / maxp_inv) */ + tc0_wred_weight 9 9 9 /* Traffic Class 0 queue weight */ + tc1_wred_min 48 40 32 /* Minimum threshold for traffic class 1 queue (min_th) in number of packets */ + tc1_wred_max 64 64 64 /* Maximum threshold for traffic class 1 queue (max_th) in number of packets */ + tc1_wred_inv_prob 10 10 10 /* Inverse of packet marking probability for traffic class 1 queue (maxp = 1 / maxp_inv) */ + tc1_wred_weight 9 9 9 /* Traffic Class 1 queue weight */ + tc2_wred_min 48 40 32 /* Minimum threshold for traffic class 2 queue (min_th) in number of packets */ + tc2_wred_max 64 64 64 /* Maximum threshold for traffic class 2 queue (max_th) in number of packets */ + tc2_wred_inv_prob 10 10 10 /* Inverse of packet marking probability for traffic class 2 queue (maxp = 1 / maxp_inv) */ + tc2_wred_weight 9 9 9 /* Traffic Class 2 queue weight */ + tc3_wred_min 48 40 32 /* Minimum threshold for traffic class 3 queue (min_th) in number of packets */ + tc3_wred_max 64 64 64 /* Maximum threshold for traffic class 3 queue (max_th) in number of packets */ + tc3_wred_inv_prob 10 10 10 /* Inverse of packet marking probability for traffic class 3 queue (maxp = 1 / maxp_inv) */ + tc3_wred_weight 9 9 9 /* Traffic Class 3 queue weight */ +} +``` + + 
+### DPDK QoS Scheduler Integration in VPP
+
+The Hierarchical Quality-of-Service (HQoS) scheduler object can be seen as
+part of the logical NIC output interface. To enable HQoS on a specific
+output interface, the VPP startup.conf file has to be configured
+accordingly. The output interface that requires HQoS should have the "hqos"
+parameter specified in the dpdk section. Another optional parameter,
+"hqos-thread", can be used to associate the output interface with a specific
+HQoS thread. In the cpu section of the config file, "corelist-hqos-threads"
+is used to assign logical CPU cores to run the HQoS threads. An HQoS thread
+can run multiple HQoS objects, each associated with a different output
+interface. Instead of writing packets to the NIC TX queue directly, all
+worker threads write the packets to software queues. The HQoS threads read
+the software queues and enqueue the packets to HQoS objects, as well as
+dequeue packets from HQoS objects and write them to NIC output interfaces.
+The worker threads need to be able to send packets to any output interface;
+therefore, each HQoS object associated with a NIC output interface should
+have as many software queues as there are worker threads.
+
+The following illustrates a sample startup configuration file with 4 worker
+threads feeding 2 HQoS threads, each handling the QoS scheduler of one
+output interface.
+
+```
+dpdk {
+  socket-mem 16384,16384
+
+  dev 0000:02:00.0 {
+    num-rx-queues 2
+    hqos
+  }
+  dev 0000:06:00.0 {
+    num-rx-queues 2
+    hqos
+  }
+
+  num-mbufs 1000000
+}
+
+cpu {
+  main-core 0
+  corelist-workers 1, 2, 3, 4
+  corelist-hqos-threads 5, 6
+}
+```
+
+
+### QoS scheduler CLI Commands
+
+Each QoS scheduler instance is initialised with the default parameters
+required to configure the HQoS port, subport, pipe and queues. Some of the
+parameters can be reconfigured at run time through CLI commands.
+
+
+#### Configuration
+
+The following commands can be used to configure QoS scheduler parameters.
+
+The command below can be used to set the subport level parameters such as
+token bucket rate (bytes per second), token bucket size (bytes), traffic
+class rates (bytes per second) and token update period (milliseconds).
+
+```
+set dpdk interface hqos subport <interface> subport <subport_id> [rate <n>]
+    [bktsize <n>] [tc0 <n>] [tc1 <n>] [tc2 <n>] [tc3 <n>] [period <n>]
+```
+
+For setting the pipe profile, the following command can be used.
+
+```
+set dpdk interface hqos pipe <interface> subport <subport_id> pipe <pipe_id>
+    profile <profile_id>
+```
+
+To assign a QoS scheduler instance to a specific thread, the following
+command can be used.
+
+```
+set dpdk interface hqos placement <interface> thread <n>
+```
+
+The command below is used to set the packet fields required for classifying
+the incoming packet. As a result of the classification process, the packet
+field information is mapped to a 5-tuple (subport, pipe, traffic class,
+queue within traffic class, color) and stored in the packet mbuf.
+
+```
+set dpdk interface hqos pktfield <interface> id subport|pipe|tc offset <n>
+    mask <hex-mask>
+```
+
+The DSCP table entries used for identifying the traffic class and queue can
+be set using the command below:
+
+```
+set dpdk interface hqos tctbl <interface> entry <map_val> tc <tc_id> queue <queue_id>
+```
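+
+For example, in the default translation table (visible in the show command
+output below), mapped value 5 corresponds to traffic class 1 / queue 1. The
+command below, with illustrative values only, would remap entry 5 to traffic
+class 3 / queue 0 on interface TenGigabitEthernet2/0/0:
+
+```
+set dpdk interface hqos tctbl TenGigabitEthernet2/0/0 entry 5 tc 3 queue 0
+```
+
+
+#### Show Command
+
+The QoS scheduler configuration can be displayed using the command below.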
+
+```
+   vpp# show dpdk interface hqos TenGigabitEthernet2/0/0
+   Thread:
+     Input SWQ size = 4096 packets
+     Enqueue burst size = 256 packets
+     Dequeue burst size = 220 packets
+     Packet field 0: slab position = 0, slab bitmask = 0x0000000000000000 (subport)
+     Packet field 1: slab position = 40, slab bitmask = 0x0000000fff000000 (pipe)
+     Packet field 2: slab position = 8, slab bitmask = 0x00000000000000fc (tc)
+     Packet field 2 tc translation table: ([Mapped Value Range]: tc/queue tc/queue ...)
+       [ 0 .. 15]: 0/0 0/1 0/2 0/3 1/0 1/1 1/2 1/3 2/0 2/1 2/2 2/3 3/0 3/1 3/2 3/3
+       [16 .. 31]: 0/0 0/1 0/2 0/3 1/0 1/1 1/2 1/3 2/0 2/1 2/2 2/3 3/0 3/1 3/2 3/3
+       [32 .. 47]: 0/0 0/1 0/2 0/3 1/0 1/1 1/2 1/3 2/0 2/1 2/2 2/3 3/0 3/1 3/2 3/3
+       [48 .. 63]: 0/0 0/1 0/2 0/3 1/0 1/1 1/2 1/3 2/0 2/1 2/2 2/3 3/0 3/1 3/2 3/3
+   Port:
+     Rate = 1250000000 bytes/second
+     MTU = 1514 bytes
+     Frame overhead = 24 bytes
+     Number of subports = 1
+     Number of pipes per subport = 4096
+     Packet queue size: TC0 = 64, TC1 = 64, TC2 = 64, TC3 = 64 packets
+     Number of pipe profiles = 1
+   Subport 0:
+     Rate = 120000000 bytes/second
+     Token bucket size = 1000000 bytes
+     Traffic class rate: TC0 = 120000000, TC1 = 120000000, TC2 = 120000000, TC3 = 120000000 bytes/second
+     TC period = 10 milliseconds
+   Pipe profile 0:
+     Rate = 305175 bytes/second
+     Token bucket size = 1000000 bytes
+     Traffic class rate: TC0 = 305175, TC1 = 305175, TC2 = 305175, TC3 = 305175 bytes/second
+     TC period = 40 milliseconds
+     TC0 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
+     TC1 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
+     TC2 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
+     TC3 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
+```
+
+The QoS scheduler placement over the logical CPU cores can be displayed
+using the command below.
+
+```
+   vpp# show dpdk interface hqos placement
+   Thread 5 (vpp_hqos-threads_0 at lcore 5):
+     TenGigabitEthernet2/0/0 queue 0
+   Thread 6 (vpp_hqos-threads_1 at lcore 6):
+     TenGigabitEthernet4/0/1 queue 0
+```
+
+
+### QoS Scheduler Binary APIs
+
+This section explains the available binary APIs for configuring QoS
+scheduler parameters at run time.
+
+The following API can be used to set the pipe profile of a pipe that belongs
+to a given subport:
+
+```
+sw_interface_set_dpdk_hqos_pipe rx <intfc> | sw_if_index <id>
+  subport <subport-id> pipe <pipe-id> profile <profile-id>
+```
+
+The data structures used to set the pipe profile parameters are as follows:
+
+```
+    /** \\brief DPDK interface HQoS pipe profile set request
+        @param client_index - opaque cookie to identify the sender
+        @param context - sender context, to match reply w/ request
+        @param sw_if_index - the interface
+        @param subport - subport ID
+        @param pipe - pipe ID within its subport
+        @param profile - pipe profile ID
+    */
+    define sw_interface_set_dpdk_hqos_pipe {
+      u32 client_index;
+      u32 context;
+      u32 sw_if_index;
+      u32 subport;
+      u32 pipe;
+      u32 profile;
+    };
+
+    /** \\brief DPDK interface HQoS pipe profile set reply
+        @param context - sender context, to match reply w/ request
+        @param retval - request return code
+    */
+    define sw_interface_set_dpdk_hqos_pipe_reply {
+      u32 context;
+      i32 retval;
+    };
+```
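+
+As a usage sketch, a VPP API test (VAT) invocation of this call could look
+as follows; the subport, pipe and profile IDs are illustrative, with profile
+0 being the only profile present in the default configuration:
+
+```
+sw_interface_set_dpdk_hqos_pipe rx TenGigabitEthernet2/0/0 subport 0 pipe 2 profile 0
+```
+
+The following API can be used to set the subport level parameters, for
+example token bucket rate (bytes per second), token bucket size (bytes),
+traffic class rates (bytes per second) and the token update period.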
+
+```
+sw_interface_set_dpdk_hqos_subport rx <intfc> | sw_if_index <id>
+  subport <subport-id> [rate <n>] [bktsize <n>]
+  [tc0 <n>] [tc1 <n>] [tc2 <n>] [tc3 <n>] [period <n>]
+```
+
+The data structures used to set the subport level parameters are as follows:
+
+```
+    /** \\brief DPDK interface HQoS subport parameters set request
+        @param client_index - opaque cookie to identify the sender
+        @param context - sender context, to match reply w/ request
+        @param sw_if_index - the interface
+        @param subport - subport ID
+        @param tb_rate - subport token bucket rate (measured in bytes/second)
+        @param tb_size - subport token bucket size (measured in credits)
+        @param tc_rate - subport traffic class 0 .. 3 rates (measured in bytes/second)
+        @param tc_period - enforcement period for rates (measured in milliseconds)
+    */
+    define sw_interface_set_dpdk_hqos_subport {
+      u32 client_index;
+      u32 context;
+      u32 sw_if_index;
+      u32 subport;
+      u32 tb_rate;
+      u32 tb_size;
+      u32 tc_rate[4];
+      u32 tc_period;
+    };
+
+    /** \\brief DPDK interface HQoS subport parameters set reply
+        @param context - sender context, to match reply w/ request
+        @param retval - request return code
+    */
+    define sw_interface_set_dpdk_hqos_subport_reply {
+      u32 context;
+      i32 retval;
+    };
+```
+
+The following API can be used to set a DSCP table entry. The DSCP table has
+64 entries that map the packet DSCP field onto a traffic class and HQoS
+input queue.
+
+```
+sw_interface_set_dpdk_hqos_tctbl rx <intfc> | sw_if_index <id>
+  entry <n> tc <n> queue <n>
+```
+
+The data structures used for setting DSCP table entries are given below.
+
+```
+    /** \\brief DPDK interface HQoS tctbl entry set request
+        @param client_index - opaque cookie to identify the sender
+        @param context - sender context, to match reply w/ request
+        @param sw_if_index - the interface
+        @param entry - entry index ID
+        @param tc - traffic class (0 .. 3)
+        @param queue - traffic class queue (0 .. 3)
+    */
+    define sw_interface_set_dpdk_hqos_tctbl {
+      u32 client_index;
+      u32 context;
+      u32 sw_if_index;
+      u32 entry;
+      u32 tc;
+      u32 queue;
+    };
+
+    /** \\brief DPDK interface HQoS tctbl entry set reply
+        @param context - sender context, to match reply w/ request
+        @param retval - request return code
+    */
+    define sw_interface_set_dpdk_hqos_tctbl_reply {
+      u32 context;
+      i32 retval;
+    };
+```
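+
+Internally, each translation table entry packs the traffic class and queue
+into a single value. The sketch below is an assumption mirroring the lookup
+in dpdk_hqos_metadata_set() in hqos.c, where tc = entry >> 2 and
+queue = entry & 0x3:
+
+```
+/* Sketch (assumption): how a tctbl entry packs tc and queue, mirroring
+ * the lookup path written in dpdk_hqos_metadata_set () in hqos.c. */
+u32 entry_val = (tc << 2) | (queue & 0x3);   /* set path (assumed) */
+u32 tc_out    = tc_table[dscp & 0x3F] >> 2;  /* lookup, as in hqos.c */
+u32 queue_out = tc_table[dscp & 0x3F] & 0x3;
+```
diff --git a/src/plugins/dpdk/ipsec/cli.c b/src/plugins/dpdk/ipsec/cli.c
new file mode 100644
index 00000000..a9cf2502
--- /dev/null
+++ b/src/plugins/dpdk/ipsec/cli.c
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2016 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.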
+ */ + +#include <vnet/vnet.h> +#include <dpdk/device/dpdk.h> +#include <dpdk/ipsec/ipsec.h> + +static void +dpdk_ipsec_show_mapping (vlib_main_t * vm, u16 detail_display) +{ + dpdk_crypto_main_t *dcm = &dpdk_crypto_main; + vlib_thread_main_t *tm = vlib_get_thread_main (); + u32 i, skip_master; + + if (!dcm->enabled) + { + vlib_cli_output (vm, "DPDK Cryptodev support is disabled\n"); + return; + } + + if (detail_display) + vlib_cli_output (vm, "worker\t%10s\t%15s\tdir\tdev\tqp\n", + "cipher", "auth"); + else + vlib_cli_output (vm, "worker\tcrypto device id(type)\n"); + + skip_master = vlib_num_workers () > 0; + + for (i = 0; i < tm->n_vlib_mains; i++) + { + uword key, data; + u32 thread_index = vlib_mains[i]->thread_index; + crypto_worker_main_t *cwm = &dcm->workers_main[thread_index]; + u8 *s = 0; + + if (skip_master) + { + skip_master = 0; + continue; + } + + if (!detail_display) + { + i32 last_cdev = -1; + crypto_qp_data_t *qpd; + + s = format (s, "%u\t", thread_index); + + /* *INDENT-OFF* */ + vec_foreach (qpd, cwm->qp_data) + { + u32 dev_id = qpd->dev_id; + + if ((u16) last_cdev != dev_id) + { + struct rte_cryptodev_info cdev_info; + + rte_cryptodev_info_get (dev_id, &cdev_info); + + s = format(s, "%u(%s)\t", dev_id, cdev_info.feature_flags & + RTE_CRYPTODEV_FF_HW_ACCELERATED ? "HW" : "SW"); + } + last_cdev = dev_id; + } + /* *INDENT-ON* */ + vlib_cli_output (vm, "%s", s); + } + else + { + char cipher_str[15], auth_str[15]; + struct rte_cryptodev_capabilities cap; + crypto_worker_qp_key_t *p_key = (crypto_worker_qp_key_t *) & key; + /* *INDENT-OFF* */ + hash_foreach (key, data, cwm->algo_qp_map, + ({ + cap.op = RTE_CRYPTO_OP_TYPE_SYMMETRIC; +#if DPDK_NO_AEAD + cap.sym.xform_type = RTE_CRYPTO_SYM_XFORM_CIPHER; + cap.sym.cipher.algo = p_key->cipher_algo; +#else + if (p_key->is_aead) + { + cap.sym.xform_type = RTE_CRYPTO_SYM_XFORM_AEAD; + cap.sym.aead.algo = p_key->cipher_algo; + } + else + { + cap.sym.xform_type = RTE_CRYPTO_SYM_XFORM_CIPHER; + cap.sym.cipher.algo = p_key->cipher_algo; + } +#endif + check_algo_is_supported (&cap, cipher_str); + + cap.op = RTE_CRYPTO_OP_TYPE_SYMMETRIC; + cap.sym.xform_type = RTE_CRYPTO_SYM_XFORM_AUTH; + cap.sym.auth.algo = p_key->auth_algo; + check_algo_is_supported (&cap, auth_str); + + vlib_cli_output (vm, "%u\t%10s\t%15s\t%3s\t%u\t%u\n", + vlib_mains[i]->thread_index, cipher_str, auth_str, + p_key->is_outbound ? "out" : "in", + cwm->qp_data[data].dev_id, + cwm->qp_data[data].qp_id); + })); + /* *INDENT-ON* */ + } + } +} + +static clib_error_t * +lcore_cryptodev_map_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + u16 detail = 0; + clib_error_t *error = NULL; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "verbose")) + detail = 1; + else + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } + } + + dpdk_ipsec_show_mapping (vm, detail); + +done: + unformat_free (line_input); + + return error; +} + +/*? + * This command is used to display the DPDK Crypto device data. See + * @ref dpdk_crypto_ipsec_doc for more details on initializing the + * DPDK Crypto device. 
+ * + * @cliexpar + * Example of displaying the DPDK Crypto device data when disabled: + * @cliexstart{show crypto device mapping} + * DPDK Cryptodev support is disabled + * @cliexend + * Example of displaying the DPDK Crypto device data when enabled: + * @cliexstart{show crypto device mapping} + * worker crypto device id(type) + * 1 1(SW) + * 2 1(SW) + * @cliexend + * Example of displaying the DPDK Crypto device data when enabled with verbose: + * @cliexstart{show crypto device mapping verbose} + * worker cipher auth dir dev qp + * 1 AES_CTR AES-XCBC-MAC in 1 0 + * 1 AES_CTR HMAC-SHA384 in 1 0 + * 1 AES_CTR HMAC-SHA384 out 1 1 + * 1 AES_CBC HMAC-SHA512 in 1 0 + * 1 AES_CBC HMAC-SHA256 in 1 0 + * 1 AES_CBC AES-XCBC-MAC out 1 1 + * 1 AES_CTR AES-XCBC-MAC out 1 1 + * 1 AES_CBC HMAC-SHA256 out 1 1 + * 1 AES_CTR HMAC-SHA512 out 1 1 + * 1 AES_CTR HMAC-SHA256 in 1 0 + * 1 AES_CTR HMAC-SHA1 in 1 0 + * 1 AES_CBC HMAC-SHA512 out 1 1 + * 1 AES_CBC HMAC-SHA384 out 1 1 + * 1 AES_CTR HMAC-SHA1 out 1 1 + * 1 AES_CTR HMAC-SHA256 out 1 1 + * 1 AES_CBC HMAC-SHA1 in 1 0 + * 1 AES_CBC AES-XCBC-MAC in 1 0 + * 1 AES_CTR HMAC-SHA512 in 1 0 + * 1 AES_CBC HMAC-SHA1 out 1 1 + * 1 AES_CBC HMAC-SHA384 in 1 0 + * 2 AES_CTR AES-XCBC-MAC in 1 2 + * 2 AES_CTR HMAC-SHA384 in 1 2 + * 2 AES_CTR HMAC-SHA384 out 1 3 + * 2 AES_CBC HMAC-SHA512 in 1 2 + * 2 AES_CBC HMAC-SHA256 in 1 2 + * 2 AES_CBC AES-XCBC-MAC out 1 3 + * 2 AES_CTR AES-XCBC-MAC out 1 3 + * 2 AES_CBC HMAC-SHA256 out 1 3 + * 2 AES_CTR HMAC-SHA512 out 1 3 + * 2 AES_CTR HMAC-SHA256 in 1 2 + * 2 AES_CTR HMAC-SHA1 in 1 2 + * 2 AES_CBC HMAC-SHA512 out 1 3 + * 2 AES_CBC HMAC-SHA384 out 1 3 + * 2 AES_CTR HMAC-SHA1 out 1 3 + * 2 AES_CTR HMAC-SHA256 out 1 3 + * 2 AES_CBC HMAC-SHA1 in 1 2 + * 2 AES_CBC AES-XCBC-MAC in 1 2 + * 2 AES_CTR HMAC-SHA512 in 1 2 + * 2 AES_CBC HMAC-SHA1 out 1 3 + * 2 AES_CBC HMAC-SHA384 in 1 2 + * @cliexend +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (lcore_cryptodev_map, static) = { + .path = "show crypto device mapping", + .short_help = + "show cryptodev device mapping [verbose]", + .function = lcore_cryptodev_map_fn, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/dpdk/ipsec/crypto_node.c b/src/plugins/dpdk/ipsec/crypto_node.c new file mode 100644 index 00000000..a3c45902 --- /dev/null +++ b/src/plugins/dpdk/ipsec/crypto_node.c @@ -0,0 +1,215 @@ +/* + *------------------------------------------------------------------ + * crypto_node.c - DPDK Cryptodev input node + * + * Copyright (c) 2016 Intel and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *------------------------------------------------------------------ + */ + +#include <vlib/vlib.h> +#include <vnet/ip/ip.h> +#include <vnet/ethernet/ethernet.h> +#include <vnet/ipsec/ipsec.h> + +#include <dpdk/device/dpdk.h> +#include <dpdk/device/dpdk_priv.h> +#include <dpdk/ipsec/ipsec.h> + +#define foreach_dpdk_crypto_input_next \ + _(DROP, "error-drop") \ + _(ENCRYPT_POST, "dpdk-esp-encrypt-post") \ + _(DECRYPT_POST, "dpdk-esp-decrypt-post") + +typedef enum +{ +#define _(f,s) DPDK_CRYPTO_INPUT_NEXT_##f, + foreach_dpdk_crypto_input_next +#undef _ + DPDK_CRYPTO_INPUT_N_NEXT, +} dpdk_crypto_input_next_t; + +#define foreach_dpdk_crypto_input_error \ + _(DQ_COPS, "Crypto ops dequeued") \ + _(COP_FAILED, "Crypto op failed") + +typedef enum +{ +#define _(f,s) DPDK_CRYPTO_INPUT_ERROR_##f, + foreach_dpdk_crypto_input_error +#undef _ + DPDK_CRYPTO_INPUT_N_ERROR, +} dpdk_crypto_input_error_t; + +static char *dpdk_crypto_input_error_strings[] = { +#define _(n, s) s, + foreach_dpdk_crypto_input_error +#undef _ +}; + +vlib_node_registration_t dpdk_crypto_input_node; + +typedef struct +{ + u32 cdev; + u32 qp; + u32 status; + u32 sa_idx; + u32 next_index; +} dpdk_crypto_input_trace_t; + +static u8 * +format_dpdk_crypto_input_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + dpdk_crypto_input_trace_t *t = va_arg (*args, dpdk_crypto_input_trace_t *); + + s = format (s, "dpdk_crypto: cryptodev-id %u queue-pair %u next-index %d", + t->cdev, t->qp, t->next_index); + + s = format (s, " status %u sa-idx %u\n", t->status, t->sa_idx); + + return s; +} + +static_always_inline u32 +dpdk_crypto_dequeue (vlib_main_t * vm, vlib_node_runtime_t * node, + crypto_qp_data_t * qpd) +{ + u32 n_deq, *to_next = 0, next_index, n_cops, def_next_index; + struct rte_crypto_op **cops = qpd->cops; + + if (qpd->inflights == 0) + return 0; + + if (qpd->is_outbound) + def_next_index = DPDK_CRYPTO_INPUT_NEXT_ENCRYPT_POST; + else + def_next_index = DPDK_CRYPTO_INPUT_NEXT_DECRYPT_POST; + + n_cops = rte_cryptodev_dequeue_burst (qpd->dev_id, qpd->qp_id, + cops, VLIB_FRAME_SIZE); + n_deq = n_cops; + next_index = def_next_index; + + qpd->inflights -= n_cops; + ASSERT (qpd->inflights >= 0); + + while (n_cops > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_cops > 0 && n_left_to_next > 0) + { + u32 bi0, next0; + vlib_buffer_t *b0 = 0; + struct rte_crypto_op *cop; + struct rte_crypto_sym_op *sym_cop; + + cop = cops[0]; + cops += 1; + n_cops -= 1; + n_left_to_next -= 1; + + next0 = def_next_index; + + if (PREDICT_FALSE (cop->status != RTE_CRYPTO_OP_STATUS_SUCCESS)) + { + next0 = DPDK_CRYPTO_INPUT_NEXT_DROP; + vlib_node_increment_counter (vm, dpdk_crypto_input_node.index, + DPDK_CRYPTO_INPUT_ERROR_COP_FAILED, + 1); + } + cop->status = RTE_CRYPTO_OP_STATUS_NOT_PROCESSED; + + sym_cop = (struct rte_crypto_sym_op *) (cop + 1); + b0 = vlib_buffer_from_rte_mbuf (sym_cop->m_src); + bi0 = vlib_get_buffer_index (vm, b0); + + to_next[0] = bi0; + to_next += 1; + + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + vlib_trace_next_frame (vm, node, next0); + dpdk_crypto_input_trace_t *tr = + vlib_add_trace (vm, node, b0, sizeof (*tr)); + tr->cdev = qpd->dev_id; + tr->qp = qpd->qp_id; + tr->status = cop->status; + tr->next_index = next0; + tr->sa_idx = vnet_buffer (b0)->ipsec.sad_index; + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + 
n_left_to_next, bi0, next0); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + crypto_free_cop (qpd, qpd->cops, n_deq); + + vlib_node_increment_counter (vm, dpdk_crypto_input_node.index, + DPDK_CRYPTO_INPUT_ERROR_DQ_COPS, n_deq); + return n_deq; +} + +static uword +dpdk_crypto_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 thread_index = vlib_get_thread_index (); + dpdk_crypto_main_t *dcm = &dpdk_crypto_main; + crypto_worker_main_t *cwm = &dcm->workers_main[thread_index]; + crypto_qp_data_t *qpd; + u32 n_deq = 0; + + /* *INDENT-OFF* */ + vec_foreach (qpd, cwm->qp_data) + n_deq += dpdk_crypto_dequeue(vm, node, qpd); + /* *INDENT-ON* */ + + return n_deq; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (dpdk_crypto_input_node) = +{ + .function = dpdk_crypto_input_fn, + .name = "dpdk-crypto-input", + .format_trace = format_dpdk_crypto_input_trace, + .type = VLIB_NODE_TYPE_INPUT, + .state = VLIB_NODE_STATE_DISABLED, + .n_errors = DPDK_CRYPTO_INPUT_N_ERROR, + .error_strings = dpdk_crypto_input_error_strings, + .n_next_nodes = DPDK_CRYPTO_INPUT_N_NEXT, + .next_nodes = + { +#define _(s,n) [DPDK_CRYPTO_INPUT_NEXT_##s] = n, + foreach_dpdk_crypto_input_next +#undef _ + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (dpdk_crypto_input_node, dpdk_crypto_input_fn) +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/dpdk/ipsec/dir.dox b/src/plugins/dpdk/ipsec/dir.dox new file mode 100644 index 00000000..05504541 --- /dev/null +++ b/src/plugins/dpdk/ipsec/dir.dox @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2016 Intel and/or its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Doxygen directory documentation */ + +/** +@dir src/plugins/dpdk/ipsec +@brief IPSec ESP encrypt/decrypt using DPDK Cryptodev API. + +This directory contains the source code for the DPDK Crypto abstraction layer. + +*/ +/*? %%clicmd:group_label DPDK Crypto %% ?*/ +/*? %%syscfg:group_label DPDK Crypto %% ?*/ diff --git a/src/plugins/dpdk/ipsec/dpdk_crypto_ipsec_doc.md b/src/plugins/dpdk/ipsec/dpdk_crypto_ipsec_doc.md new file mode 100644 index 00000000..8ea77a6c --- /dev/null +++ b/src/plugins/dpdk/ipsec/dpdk_crypto_ipsec_doc.md @@ -0,0 +1,87 @@ +# VPP IPSec implementation using DPDK Cryptodev API {#dpdk_crypto_ipsec_doc} + +This document is meant to contain all related information about implementation and usability. + + +## VPP IPsec with DPDK Cryptodev + +DPDK Cryptodev is an asynchronous crypto API that supports both Hardware and Software implementations (for more details refer to [DPDK Cryptography Device Library documentation](http://dpdk.org/doc/guides/prog_guide/cryptodev_lib.html)). + +When there are enough Cryptodev resources for all workers, the node graph is reconfigured by adding and changing the default next nodes. + +The following nodes are added: +* dpdk-crypto-input : polling input node, dequeuing from crypto devices. 
+* dpdk-esp-encrypt : internal node.
+* dpdk-esp-decrypt : internal node.
+* dpdk-esp-encrypt-post : internal node.
+* dpdk-esp-decrypt-post : internal node.
+
+Set new default next nodes:
+* for esp encryption: esp-encrypt -> dpdk-esp-encrypt
+* for esp decryption: esp-decrypt -> dpdk-esp-decrypt
+
+
+### How to enable VPP IPSec with DPDK Cryptodev support
+
+When building DPDK with VPP, Cryptodev support is always enabled.
+
+Additionally, on x86_64 platforms, DPDK is built with SW crypto support.
+
+
+### Crypto Resources allocation
+
+VPP allocates crypto resources based on a best-effort approach:
+* Hardware crypto resources are allocated first, then Software.
+* If there are not enough crypto resources for all workers, the graph node is not modified and the default VPP IPsec implementation based on OpenSSL is used. The following message is displayed:
+
+      0: dpdk_ipsec_init: not enough Cryptodevs, default to OpenSSL IPsec
+
+
+### Configuration example
+
+To enable DPDK Cryptodev, the user just needs to provide cryptodevs in the startup.conf.
+
+Below is an example startup.conf; it is not meant to be a default configuration:
+
+```
+dpdk {
+  dev 0000:81:00.0
+  dev 0000:81:00.1
+  dev 0000:85:01.0
+  dev 0000:85:01.1
+  vdev crypto_aesni_mb0,socket_id=1
+  vdev crypto_aesni_mb1,socket_id=1
+}
+```
+
+In the above configuration:
+* 0000:81:00.0 and 0000:81:00.1 are Ethernet device BDFs.
+* 0000:85:01.0 and 0000:85:01.1 are Crypto device BDFs; they require the same driver binding as DPDK Ethernet devices, but they do not support any extra configuration options.
+* Two AESNI-MB Software (Virtual) Cryptodev PMDs are created on NUMA node 1.
+
+For further details refer to the [DPDK Crypto Device Driver documentation](http://dpdk.org/doc/guides/cryptodevs/index.html)
+
+### Operational data
+
+The following CLI command displays the Cryptodev/Worker mapping:
+
+    show crypto device mapping [verbose]
+
+
+### nasm
+
+Building the DPDK Crypto Libraries requires the open source project nasm (The Netwide
+Assembler) to be installed. The recommended version of nasm is 2.12.02; the minimum
+supported version is 2.11.06. Use the following command to determine the current
+nasm version:
+
+    nasm -v
+
+CentOS 7.3 and earlier and Fedora 21 and earlier ship unsupported versions
+of nasm. Use the following set of commands to build a supported version:
+
+    wget http://www.nasm.us/pub/nasm/releasebuilds/2.12.02/nasm-2.12.02.tar.bz2
+    tar -xjvf nasm-2.12.02.tar.bz2
+    cd nasm-2.12.02/
+    ./configure
+    make
+    sudo make install
diff --git a/src/plugins/dpdk/ipsec/esp.h b/src/plugins/dpdk/ipsec/esp.h
new file mode 100644
index 00000000..51224d6c
--- /dev/null
+++ b/src/plugins/dpdk/ipsec/esp.h
@@ -0,0 +1,399 @@
+/*
+ * Copyright (c) 2016 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __DPDK_ESP_H__
+#define __DPDK_ESP_H__
+
+#include <dpdk/ipsec/ipsec.h>
+#include <vnet/ipsec/ipsec.h>
+#include <vnet/ipsec/esp.h>
+
+typedef struct
+{
+  enum rte_crypto_cipher_algorithm algo;
+#if !
DPDK_NO_AEAD + enum rte_crypto_aead_algorithm aead_algo; +#endif + u8 key_len; + u8 iv_len; +} dpdk_esp_crypto_alg_t; + +typedef struct +{ + enum rte_crypto_auth_algorithm algo; + u8 trunc_size; +} dpdk_esp_integ_alg_t; + +typedef struct +{ + dpdk_esp_crypto_alg_t *esp_crypto_algs; + dpdk_esp_integ_alg_t *esp_integ_algs; +} dpdk_esp_main_t; + +dpdk_esp_main_t dpdk_esp_main; + +static_always_inline void +dpdk_esp_init () +{ + dpdk_esp_main_t *em = &dpdk_esp_main; + dpdk_esp_integ_alg_t *i; + dpdk_esp_crypto_alg_t *c; + + vec_validate (em->esp_crypto_algs, IPSEC_CRYPTO_N_ALG - 1); + + c = &em->esp_crypto_algs[IPSEC_CRYPTO_ALG_AES_CBC_128]; + c->algo = RTE_CRYPTO_CIPHER_AES_CBC; + c->key_len = 16; + c->iv_len = 16; + + c = &em->esp_crypto_algs[IPSEC_CRYPTO_ALG_AES_CBC_192]; + c->algo = RTE_CRYPTO_CIPHER_AES_CBC; + c->key_len = 24; + c->iv_len = 16; + + c = &em->esp_crypto_algs[IPSEC_CRYPTO_ALG_AES_CBC_256]; + c->algo = RTE_CRYPTO_CIPHER_AES_CBC; + c->key_len = 32; + c->iv_len = 16; + + c = &em->esp_crypto_algs[IPSEC_CRYPTO_ALG_AES_GCM_128]; +#if DPDK_NO_AEAD + c->algo = RTE_CRYPTO_CIPHER_AES_GCM; +#else + c->aead_algo = RTE_CRYPTO_AEAD_AES_GCM; +#endif + c->key_len = 16; + c->iv_len = 8; + + vec_validate (em->esp_integ_algs, IPSEC_INTEG_N_ALG - 1); + + i = &em->esp_integ_algs[IPSEC_INTEG_ALG_SHA1_96]; + i->algo = RTE_CRYPTO_AUTH_SHA1_HMAC; + i->trunc_size = 12; + + i = &em->esp_integ_algs[IPSEC_INTEG_ALG_SHA_256_96]; + i->algo = RTE_CRYPTO_AUTH_SHA256_HMAC; + i->trunc_size = 12; + + i = &em->esp_integ_algs[IPSEC_INTEG_ALG_SHA_256_128]; + i->algo = RTE_CRYPTO_AUTH_SHA256_HMAC; + i->trunc_size = 16; + + i = &em->esp_integ_algs[IPSEC_INTEG_ALG_SHA_384_192]; + i->algo = RTE_CRYPTO_AUTH_SHA384_HMAC; + i->trunc_size = 24; + + i = &em->esp_integ_algs[IPSEC_INTEG_ALG_SHA_512_256]; + i->algo = RTE_CRYPTO_AUTH_SHA512_HMAC; + i->trunc_size = 32; +#if DPDK_NO_AEAD + i = &em->esp_integ_algs[IPSEC_INTEG_ALG_AES_GCM_128]; + i->algo = RTE_CRYPTO_AUTH_AES_GCM; + i->trunc_size = 16; +#endif +} + +static_always_inline int +translate_crypto_algo (ipsec_crypto_alg_t crypto_algo, + struct rte_crypto_sym_xform *xform, u8 use_esn) +{ +#if ! DPDK_NO_AEAD + const u16 iv_off = + sizeof (struct rte_crypto_op) + sizeof (struct rte_crypto_sym_op) + + offsetof (dpdk_cop_priv_t, cb); +#endif + + xform->type = RTE_CRYPTO_SYM_XFORM_CIPHER; + + switch (crypto_algo) + { + case IPSEC_CRYPTO_ALG_NONE: +#if ! DPDK_NO_AEAD + xform->cipher.iv.offset = iv_off; + xform->cipher.iv.length = 0; +#endif + xform->cipher.algo = RTE_CRYPTO_CIPHER_NULL; + break; + case IPSEC_CRYPTO_ALG_AES_CBC_128: + case IPSEC_CRYPTO_ALG_AES_CBC_192: + case IPSEC_CRYPTO_ALG_AES_CBC_256: +#if ! DPDK_NO_AEAD + xform->cipher.iv.offset = iv_off; + xform->cipher.iv.length = 16; +#endif + xform->cipher.algo = RTE_CRYPTO_CIPHER_AES_CBC; + break; + case IPSEC_CRYPTO_ALG_AES_GCM_128: +#if DPDK_NO_AEAD + xform->cipher.algo = RTE_CRYPTO_CIPHER_AES_GCM; +#else + xform->type = RTE_CRYPTO_SYM_XFORM_AEAD; + xform->aead.algo = RTE_CRYPTO_AEAD_AES_GCM; + xform->aead.iv.offset = iv_off; + xform->aead.iv.length = 12; /* GCM IV, not ESP IV */ + xform->aead.digest_length = 16; + xform->aead.aad_length = use_esn ? 
12 : 8; +#endif + break; + default: + return -1; + } + + return 0; +} + +static_always_inline int +translate_integ_algo (ipsec_integ_alg_t integ_alg, + struct rte_crypto_sym_xform *auth_xform, u8 use_esn) +{ + auth_xform->type = RTE_CRYPTO_SYM_XFORM_AUTH; + + switch (integ_alg) + { + case IPSEC_INTEG_ALG_NONE: + auth_xform->auth.algo = RTE_CRYPTO_AUTH_NULL; + auth_xform->auth.digest_length = 0; + break; + case IPSEC_INTEG_ALG_SHA1_96: + auth_xform->auth.algo = RTE_CRYPTO_AUTH_SHA1_HMAC; + auth_xform->auth.digest_length = 12; + break; + case IPSEC_INTEG_ALG_SHA_256_96: + auth_xform->auth.algo = RTE_CRYPTO_AUTH_SHA256_HMAC; + auth_xform->auth.digest_length = 12; + break; + case IPSEC_INTEG_ALG_SHA_256_128: + auth_xform->auth.algo = RTE_CRYPTO_AUTH_SHA256_HMAC; + auth_xform->auth.digest_length = 16; + break; + case IPSEC_INTEG_ALG_SHA_384_192: + auth_xform->auth.algo = RTE_CRYPTO_AUTH_SHA384_HMAC; + auth_xform->auth.digest_length = 24; + break; + case IPSEC_INTEG_ALG_SHA_512_256: + auth_xform->auth.algo = RTE_CRYPTO_AUTH_SHA512_HMAC; + auth_xform->auth.digest_length = 32; + break; +#if DPDK_NO_AEAD + case IPSEC_INTEG_ALG_AES_GCM_128: + auth_xform->auth.algo = RTE_CRYPTO_AUTH_AES_GCM; + auth_xform->auth.digest_length = 16; + auth_xform->auth.add_auth_data_length = use_esn ? 12 : 8; + break; +#endif + default: + return -1; + } + + return 0; +} + +static_always_inline i32 +create_sym_sess (ipsec_sa_t * sa, crypto_sa_session_t * sa_sess, + u8 is_outbound) +{ + u32 thread_index = vlib_get_thread_index (); + dpdk_crypto_main_t *dcm = &dpdk_crypto_main; + crypto_worker_main_t *cwm = &dcm->workers_main[thread_index]; + struct rte_crypto_sym_xform cipher_xform = { 0 }; + struct rte_crypto_sym_xform auth_xform = { 0 }; + struct rte_crypto_sym_xform *xfs; + uword key = 0, *data; + crypto_worker_qp_key_t *p_key = (crypto_worker_qp_key_t *) & key; +#if ! DPDK_NO_AEAD + i32 socket_id = rte_socket_id (); + i32 ret; +#endif + + if (sa->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128) + { + sa->crypto_key_len -= 4; + clib_memcpy (&sa->salt, &sa->crypto_key[sa->crypto_key_len], 4); + } + else + { + u32 seed = (u32) clib_cpu_time_now (); + sa->salt = random_u32 (&seed); + } + + if (translate_crypto_algo (sa->crypto_alg, &cipher_xform, sa->use_esn) < 0) + return -1; + p_key->cipher_algo = cipher_xform.cipher.algo; + + if (translate_integ_algo (sa->integ_alg, &auth_xform, sa->use_esn) < 0) + return -1; + p_key->auth_algo = auth_xform.auth.algo; + +#if ! 
DPDK_NO_AEAD + if (sa->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128) + { + cipher_xform.aead.key.data = sa->crypto_key; + cipher_xform.aead.key.length = sa->crypto_key_len; + + if (is_outbound) + cipher_xform.cipher.op = + (enum rte_crypto_cipher_operation) RTE_CRYPTO_AEAD_OP_ENCRYPT; + else + cipher_xform.cipher.op = + (enum rte_crypto_cipher_operation) RTE_CRYPTO_AEAD_OP_DECRYPT; + cipher_xform.next = NULL; + xfs = &cipher_xform; + p_key->is_aead = 1; + } + else /* Cipher + Auth */ +#endif + { + cipher_xform.cipher.key.data = sa->crypto_key; + cipher_xform.cipher.key.length = sa->crypto_key_len; + + auth_xform.auth.key.data = sa->integ_key; + auth_xform.auth.key.length = sa->integ_key_len; + + if (is_outbound) + { + cipher_xform.cipher.op = RTE_CRYPTO_CIPHER_OP_ENCRYPT; + auth_xform.auth.op = RTE_CRYPTO_AUTH_OP_GENERATE; + cipher_xform.next = &auth_xform; + xfs = &cipher_xform; + } + else + { + cipher_xform.cipher.op = RTE_CRYPTO_CIPHER_OP_DECRYPT; + auth_xform.auth.op = RTE_CRYPTO_AUTH_OP_VERIFY; + auth_xform.next = &cipher_xform; + xfs = &auth_xform; + } + p_key->is_aead = 0; + } + + p_key->is_outbound = is_outbound; + + data = hash_get (cwm->algo_qp_map, key); + if (!data) + return -1; + +#if DPDK_NO_AEAD + sa_sess->sess = + rte_cryptodev_sym_session_create (cwm->qp_data[*data].dev_id, xfs); + if (!sa_sess->sess) + return -1; +#else + sa_sess->sess = + rte_cryptodev_sym_session_create (dcm->sess_h_pools[socket_id]); + if (!sa_sess->sess) + return -1; + + ret = + rte_cryptodev_sym_session_init (cwm->qp_data[*data].dev_id, sa_sess->sess, + xfs, dcm->sess_pools[socket_id]); + if (ret) + return -1; +#endif + + sa_sess->qp_index = (u8) * data; + + return 0; +} + +static_always_inline void +crypto_set_icb (dpdk_gcm_cnt_blk * icb, u32 salt, u32 seq, u32 seq_hi) +{ + icb->salt = salt; + icb->iv[0] = seq; + icb->iv[1] = seq_hi; +#if DPDK_NO_AEAD + icb->cnt = clib_host_to_net_u32 (1); +#endif +} + +#define __unused __attribute__((unused)) +static_always_inline void +crypto_op_setup (u8 is_aead, struct rte_mbuf *mb0, + struct rte_crypto_op *cop, void *session, + u32 cipher_off, u32 cipher_len, + u8 * icb __unused, u32 iv_size __unused, + u32 auth_off, u32 auth_len, + u8 * aad __unused, u32 aad_size __unused, + u8 * digest, u64 digest_paddr, u32 digest_size __unused) +{ + struct rte_crypto_sym_op *sym_cop; + + sym_cop = (struct rte_crypto_sym_op *) (cop + 1); + + sym_cop->m_src = mb0; + rte_crypto_op_attach_sym_session (cop, session); + +#if DPDK_NO_AEAD + sym_cop->cipher.data.offset = cipher_off; + sym_cop->cipher.data.length = cipher_len; + + sym_cop->cipher.iv.data = icb; + sym_cop->cipher.iv.phys_addr = + cop->phys_addr + (uintptr_t) icb - (uintptr_t) cop; + sym_cop->cipher.iv.length = iv_size; + + if (is_aead) + { + sym_cop->auth.aad.data = aad; + sym_cop->auth.aad.phys_addr = + cop->phys_addr + (uintptr_t) aad - (uintptr_t) cop; + sym_cop->auth.aad.length = aad_size; + } + else + { + sym_cop->auth.data.offset = auth_off; + sym_cop->auth.data.length = auth_len; + } + + sym_cop->auth.digest.data = digest; + sym_cop->auth.digest.phys_addr = digest_paddr; + sym_cop->auth.digest.length = digest_size; +#else /* ! 
DPDK_NO_AEAD */ + if (is_aead) + { + sym_cop->aead.data.offset = cipher_off; + sym_cop->aead.data.length = cipher_len; + + sym_cop->aead.aad.data = aad; + sym_cop->aead.aad.phys_addr = + cop->phys_addr + (uintptr_t) aad - (uintptr_t) cop; + + sym_cop->aead.digest.data = digest; + sym_cop->aead.digest.phys_addr = digest_paddr; + } + else + { + sym_cop->cipher.data.offset = cipher_off; + sym_cop->cipher.data.length = cipher_len; + + sym_cop->auth.data.offset = auth_off; + sym_cop->auth.data.length = auth_len; + + sym_cop->auth.digest.data = digest; + sym_cop->auth.digest.phys_addr = digest_paddr; + } +#endif /* DPDK_NO_AEAD */ +} + +#undef __unused + +#endif /* __DPDK_ESP_H__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/dpdk/ipsec/esp_decrypt.c b/src/plugins/dpdk/ipsec/esp_decrypt.c new file mode 100644 index 00000000..20936b36 --- /dev/null +++ b/src/plugins/dpdk/ipsec/esp_decrypt.c @@ -0,0 +1,569 @@ +/* + * esp_decrypt.c : IPSec ESP Decrypt node using DPDK Cryptodev + * + * Copyright (c) 2016 Intel and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vnet/vnet.h> +#include <vnet/api_errno.h> +#include <vnet/ip/ip.h> + +#include <vnet/ipsec/ipsec.h> +#include <dpdk/ipsec/ipsec.h> +#include <dpdk/ipsec/esp.h> +#include <dpdk/device/dpdk.h> +#include <dpdk/device/dpdk_priv.h> + +#define foreach_esp_decrypt_next \ +_(DROP, "error-drop") \ +_(IP4_INPUT, "ip4-input") \ +_(IP6_INPUT, "ip6-input") + +#define _(v, s) ESP_DECRYPT_NEXT_##v, +typedef enum { + foreach_esp_decrypt_next +#undef _ + ESP_DECRYPT_N_NEXT, +} esp_decrypt_next_t; + +#define foreach_esp_decrypt_error \ + _(RX_PKTS, "ESP pkts received") \ + _(DECRYPTION_FAILED, "ESP decryption failed") \ + _(REPLAY, "SA replayed packet") \ + _(NOT_IP, "Not IP packet (dropped)") \ + _(ENQ_FAIL, "Enqueue failed (buffer full)") \ + _(NO_CRYPTODEV, "Cryptodev not configured") \ + _(BAD_LEN, "Invalid ciphertext length") + + +typedef enum { +#define _(sym,str) ESP_DECRYPT_ERROR_##sym, + foreach_esp_decrypt_error +#undef _ + ESP_DECRYPT_N_ERROR, +} esp_decrypt_error_t; + +static char * esp_decrypt_error_strings[] = { +#define _(sym,string) string, + foreach_esp_decrypt_error +#undef _ +}; + +vlib_node_registration_t dpdk_esp_decrypt_node; + +typedef struct { + ipsec_crypto_alg_t crypto_alg; + ipsec_integ_alg_t integ_alg; +} esp_decrypt_trace_t; + +/* packet trace format function */ +static u8 * format_esp_decrypt_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + esp_decrypt_trace_t * t = va_arg (*args, esp_decrypt_trace_t *); + + s = format (s, "esp: crypto %U integrity %U", + format_ipsec_crypto_alg, t->crypto_alg, + format_ipsec_integ_alg, t->integ_alg); + return s; +} + +static uword +dpdk_esp_decrypt_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ 
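  /* Annotation (reader's note, not part of the original commit): per-packet
   * work in this node is the anti-replay check, lazy cryptodev session
   * creation for the SA, and filling in a rte_crypto_op (AEAD or
   * cipher+auth layout depending on the DPDK version); the accumulated ops
   * are then pushed to each queue pair in a single
   * rte_cryptodev_enqueue_burst at the bottom of the function. Completed
   * operations are presumably collected by dpdk-crypto-input and handed to
   * dpdk-esp-decrypt-post for header rewrite and forwarding. */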
+ u32 n_left_from, *from, *to_next, next_index; + ipsec_main_t *im = &ipsec_main; + u32 thread_index = vlib_get_thread_index(); + dpdk_crypto_main_t * dcm = &dpdk_crypto_main; + dpdk_esp_main_t * em = &dpdk_esp_main; + u32 i; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + crypto_worker_main_t *cwm = + vec_elt_at_index(dcm->workers_main, thread_index); + u32 n_qps = vec_len(cwm->qp_data); + struct rte_crypto_op ** cops_to_enq[n_qps]; + u32 n_cop_qp[n_qps], * bi_to_enq[n_qps]; + + for (i = 0; i < n_qps; i++) + { + bi_to_enq[i] = cwm->qp_data[i].bi; + cops_to_enq[i] = cwm->qp_data[i].cops; + } + + memset(n_cop_qp, 0, n_qps * sizeof(u32)); + + crypto_alloc_cops(); + + next_index = ESP_DECRYPT_NEXT_DROP; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0, sa_index0 = ~0, seq, trunc_size, iv_size; + vlib_buffer_t * b0; + esp_header_t * esp0; + ipsec_sa_t * sa0; + struct rte_mbuf * mb0 = 0; + const int BLOCK_SIZE = 16; + crypto_sa_session_t * sa_sess; + void * sess; + u16 qp_index; + struct rte_crypto_op * cop = 0; + + bi0 = from[0]; + from += 1; + n_left_from -= 1; + + b0 = vlib_get_buffer (vm, bi0); + esp0 = vlib_buffer_get_current (b0); + + sa_index0 = vnet_buffer(b0)->ipsec.sad_index; + sa0 = pool_elt_at_index (im->sad, sa_index0); + + seq = clib_host_to_net_u32(esp0->seq); + + /* anti-replay check */ + if (sa0->use_anti_replay) + { + int rv = 0; + + if (PREDICT_TRUE(sa0->use_esn)) + rv = esp_replay_check_esn(sa0, seq); + else + rv = esp_replay_check(sa0, seq); + + if (PREDICT_FALSE(rv)) + { + clib_warning ("anti-replay SPI %u seq %u", sa0->spi, seq); + vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index, + ESP_DECRYPT_ERROR_REPLAY, 1); + to_next[0] = bi0; + to_next += 1; + n_left_to_next -= 1; + goto trace; + } + } + + sa0->total_data_size += b0->current_length; + + sa_sess = pool_elt_at_index(cwm->sa_sess_d[0], sa_index0); + + if (PREDICT_FALSE(!sa_sess->sess)) + { + int ret = create_sym_sess(sa0, sa_sess, 0); + + if (PREDICT_FALSE (ret)) + { + to_next[0] = bi0; + to_next += 1; + n_left_to_next -= 1; + goto trace; + } + } + + sess = sa_sess->sess; + qp_index = sa_sess->qp_index; + + ASSERT (vec_len (vec_elt (cwm->qp_data, qp_index).free_cops) > 0); + cop = vec_pop (vec_elt (cwm->qp_data, qp_index).free_cops); + ASSERT (cop->status == RTE_CRYPTO_OP_STATUS_NOT_PROCESSED); + + cops_to_enq[qp_index][0] = cop; + cops_to_enq[qp_index] += 1; + n_cop_qp[qp_index] += 1; + bi_to_enq[qp_index][0] = bi0; + bi_to_enq[qp_index] += 1; + + rte_crypto_op_attach_sym_session(cop, sess); + + if (sa0->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128) + trunc_size = 16; + else + trunc_size = em->esp_integ_algs[sa0->integ_alg].trunc_size; + iv_size = em->esp_crypto_algs[sa0->crypto_alg].iv_len; + + /* Convert vlib buffer to mbuf */ + mb0 = rte_mbuf_from_vlib_buffer(b0); + mb0->data_len = b0->current_length; + mb0->pkt_len = b0->current_length; + mb0->data_off = RTE_PKTMBUF_HEADROOM + b0->current_data; + + /* Outer IP header has already been stripped */ + u16 payload_len = rte_pktmbuf_pkt_len(mb0) - sizeof (esp_header_t) - + iv_size - trunc_size; + + if ((payload_len & (BLOCK_SIZE - 1)) || (payload_len <= 0)) + { + clib_warning ("payload %u not multiple of %d\n", + payload_len, BLOCK_SIZE); + vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index, + ESP_DECRYPT_ERROR_BAD_LEN, 1); + vec_add (vec_elt (cwm->qp_data, 
qp_index).free_cops, &cop, 1); + bi_to_enq[qp_index] -= 1; + cops_to_enq[qp_index] -= 1; + n_cop_qp[qp_index] -= 1; + to_next[0] = bi0; + to_next += 1; + n_left_to_next -= 1; + goto trace; + } + + struct rte_crypto_sym_op *sym_cop = (struct rte_crypto_sym_op *)(cop + 1); + + u8 is_aead = sa0->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128; + u32 cipher_off, cipher_len; + u32 auth_off = 0, auth_len = 0, aad_size = 0; + u8 *aad = NULL, *digest = NULL; + u64 digest_paddr; + + u8 *iv = rte_pktmbuf_mtod_offset(mb0, void*, sizeof (esp_header_t)); + dpdk_cop_priv_t *priv = (dpdk_cop_priv_t *)(sym_cop + 1); + dpdk_gcm_cnt_blk *icb = &priv->cb; + + cipher_off = sizeof (esp_header_t) + iv_size; + cipher_len = payload_len; + + digest = + vlib_buffer_get_current (b0) + sizeof(esp_header_t) + + iv_size + payload_len; + + digest_paddr = mb0->buf_physaddr + (digest - (u8 *) mb0->buf_addr); + + if (is_aead) + { + u32 *_iv = (u32 *) iv; + + crypto_set_icb (icb, sa0->salt, _iv[0], _iv[1]); + iv_size = 16; + + aad = priv->aad; + clib_memcpy(aad, esp0, 8); + aad_size = 8; + if (sa0->use_esn) + { + *((u32*)&aad[8]) = sa0->seq_hi; + aad_size = 12; + } + } + else + { + clib_memcpy(icb, iv, 16); + + auth_off = 0; + auth_len = sizeof(esp_header_t) + iv_size + payload_len; + + if (sa0->use_esn) + { + dpdk_cop_priv_t* priv = (dpdk_cop_priv_t*) (sym_cop + 1); + + clib_memcpy (priv->icv, digest, trunc_size); + *((u32*) digest) = sa0->seq_hi; + auth_len += sizeof(sa0->seq_hi); + + digest = priv->icv; + digest_paddr = + cop->phys_addr + (uintptr_t) priv->icv - (uintptr_t) cop; + } + } + + crypto_op_setup (is_aead, mb0, cop, sess, + cipher_off, cipher_len, (u8 *) icb, iv_size, + auth_off, auth_len, aad, aad_size, + digest, digest_paddr, trunc_size); +trace: + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + esp_decrypt_trace_t *tr = vlib_add_trace (vm, node, b0, sizeof (*tr)); + tr->crypto_alg = sa0->crypto_alg; + tr->integ_alg = sa0->integ_alg; + } + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index, + ESP_DECRYPT_ERROR_RX_PKTS, + from_frame->n_vectors); + crypto_qp_data_t *qpd; + /* *INDENT-OFF* */ + vec_foreach_index (i, cwm->qp_data) + { + u32 enq; + + if (!n_cop_qp[i]) + continue; + + qpd = vec_elt_at_index(cwm->qp_data, i); + enq = rte_cryptodev_enqueue_burst(qpd->dev_id, qpd->qp_id, + qpd->cops, n_cop_qp[i]); + qpd->inflights += enq; + + if (PREDICT_FALSE(enq < n_cop_qp[i])) + { + crypto_free_cop (qpd, &qpd->cops[enq], n_cop_qp[i] - enq); + vlib_buffer_free (vm, &qpd->bi[enq], n_cop_qp[i] - enq); + + vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index, + ESP_DECRYPT_ERROR_ENQ_FAIL, + n_cop_qp[i] - enq); + } + } + /* *INDENT-ON* */ + + return from_frame->n_vectors; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (dpdk_esp_decrypt_node) = { + .function = dpdk_esp_decrypt_node_fn, + .name = "dpdk-esp-decrypt", + .vector_size = sizeof (u32), + .format_trace = format_esp_decrypt_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(esp_decrypt_error_strings), + .error_strings = esp_decrypt_error_strings, + + .n_next_nodes = ESP_DECRYPT_N_NEXT, + .next_nodes = { +#define _(s,n) [ESP_DECRYPT_NEXT_##s] = n, + foreach_esp_decrypt_next +#undef _ + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (dpdk_esp_decrypt_node, dpdk_esp_decrypt_node_fn) + +/* + * Decrypt Post Node + */ + +#define foreach_esp_decrypt_post_error \ + _(PKTS, "ESP post pkts") + +typedef enum { +#define _(sym,str) 
ESP_DECRYPT_POST_ERROR_##sym, + foreach_esp_decrypt_post_error +#undef _ + ESP_DECRYPT_POST_N_ERROR, +} esp_decrypt_post_error_t; + +static char * esp_decrypt_post_error_strings[] = { +#define _(sym,string) string, + foreach_esp_decrypt_post_error +#undef _ +}; + +vlib_node_registration_t dpdk_esp_decrypt_post_node; + +static u8 * format_esp_decrypt_post_trace (u8 * s, va_list * args) +{ + return s; +} + +static uword +dpdk_esp_decrypt_post_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + u32 n_left_from, *from, *to_next = 0, next_index; + ipsec_sa_t * sa0; + u32 sa_index0 = ~0; + ipsec_main_t *im = &ipsec_main; + dpdk_esp_main_t *em = &dpdk_esp_main; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + esp_footer_t * f0; + u32 bi0, next0, trunc_size, iv_size; + vlib_buffer_t * b0 = 0; + ip4_header_t *ih4 = 0, *oh4 = 0; + ip6_header_t *ih6 = 0, *oh6 = 0; + u8 tunnel_mode = 1; + u8 transport_ip6 = 0; + + next0 = ESP_DECRYPT_NEXT_DROP; + + bi0 = from[0]; + from += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + sa_index0 = vnet_buffer(b0)->ipsec.sad_index; + sa0 = pool_elt_at_index (im->sad, sa_index0); + + to_next[0] = bi0; + to_next += 1; + + if (sa0->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128) + trunc_size = 16; + else + trunc_size = em->esp_integ_algs[sa0->integ_alg].trunc_size; + iv_size = em->esp_crypto_algs[sa0->crypto_alg].iv_len; + + if (sa0->use_anti_replay) + { + esp_header_t * esp0 = vlib_buffer_get_current (b0); + u32 seq; + seq = clib_host_to_net_u32(esp0->seq); + if (PREDICT_TRUE(sa0->use_esn)) + esp_replay_advance_esn(sa0, seq); + else + esp_replay_advance(sa0, seq); + } + + ih4 = (ip4_header_t *) (b0->data + sizeof(ethernet_header_t)); + vlib_buffer_advance (b0, sizeof (esp_header_t) + iv_size); + + b0->current_length -= (trunc_size + 2); + b0->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; + f0 = (esp_footer_t *) ((u8 *) vlib_buffer_get_current (b0) + + b0->current_length); + b0->current_length -= f0->pad_length; + + /* transport mode */ + if (PREDICT_FALSE(!sa0->is_tunnel && !sa0->is_tunnel_ip6)) + { + tunnel_mode = 0; + + if (PREDICT_TRUE((ih4->ip_version_and_header_length & 0xF0) != 0x40)) + { + if (PREDICT_TRUE((ih4->ip_version_and_header_length & 0xF0) == 0x60)) + transport_ip6 = 1; + else + { + clib_warning("next header: 0x%x", f0->next_header); + vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index, + ESP_DECRYPT_ERROR_NOT_IP, 1); + goto trace; + } + } + } + + if (PREDICT_TRUE (tunnel_mode)) + { + if (PREDICT_TRUE(f0->next_header == IP_PROTOCOL_IP_IN_IP)) + next0 = ESP_DECRYPT_NEXT_IP4_INPUT; + else if (f0->next_header == IP_PROTOCOL_IPV6) + next0 = ESP_DECRYPT_NEXT_IP6_INPUT; + else + { + clib_warning("next header: 0x%x", f0->next_header); + vlib_node_increment_counter (vm, dpdk_esp_decrypt_node.index, + ESP_DECRYPT_ERROR_DECRYPTION_FAILED, + 1); + goto trace; + } + } + /* transport mode */ + else + { + if (PREDICT_FALSE(transport_ip6)) + { + ih6 = (ip6_header_t *) (b0->data + sizeof(ethernet_header_t)); + vlib_buffer_advance (b0, -sizeof(ip6_header_t)); + oh6 = vlib_buffer_get_current (b0); + memmove(oh6, ih6, sizeof(ip6_header_t)); + + next0 = ESP_DECRYPT_NEXT_IP6_INPUT; + oh6->protocol = f0->next_header; + oh6->payload_length = 
+ clib_host_to_net_u16 ( + vlib_buffer_length_in_chain(vm, b0) - + sizeof (ip6_header_t)); + } + else + { + vlib_buffer_advance (b0, -sizeof(ip4_header_t)); + oh4 = vlib_buffer_get_current (b0); + memmove(oh4, ih4, sizeof(ip4_header_t)); + + next0 = ESP_DECRYPT_NEXT_IP4_INPUT; + oh4->ip_version_and_header_length = 0x45; + oh4->fragment_id = 0; + oh4->flags_and_fragment_offset = 0; + oh4->protocol = f0->next_header; + oh4->length = clib_host_to_net_u16 ( + vlib_buffer_length_in_chain (vm, b0)); + oh4->checksum = ip4_header_checksum (oh4); + } + } + + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32)~0; + +trace: + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + esp_decrypt_trace_t *tr = vlib_add_trace (vm, node, b0, sizeof (*tr)); + tr->crypto_alg = sa0->crypto_alg; + tr->integ_alg = sa0->integ_alg; + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, bi0, next0); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + vlib_node_increment_counter (vm, dpdk_esp_decrypt_post_node.index, + ESP_DECRYPT_POST_ERROR_PKTS, + from_frame->n_vectors); + + return from_frame->n_vectors; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (dpdk_esp_decrypt_post_node) = { + .function = dpdk_esp_decrypt_post_node_fn, + .name = "dpdk-esp-decrypt-post", + .vector_size = sizeof (u32), + .format_trace = format_esp_decrypt_post_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(esp_decrypt_post_error_strings), + .error_strings = esp_decrypt_post_error_strings, + + .n_next_nodes = ESP_DECRYPT_N_NEXT, + .next_nodes = { +#define _(s,n) [ESP_DECRYPT_NEXT_##s] = n, + foreach_esp_decrypt_next +#undef _ + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (dpdk_esp_decrypt_post_node, dpdk_esp_decrypt_post_node_fn) diff --git a/src/plugins/dpdk/ipsec/esp_encrypt.c b/src/plugins/dpdk/ipsec/esp_encrypt.c new file mode 100644 index 00000000..b4e29e91 --- /dev/null +++ b/src/plugins/dpdk/ipsec/esp_encrypt.c @@ -0,0 +1,592 @@ +/* + * esp_encrypt.c : IPSec ESP encrypt node using DPDK Cryptodev + * + * Copyright (c) 2016 Intel and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <vnet/vnet.h> +#include <vnet/api_errno.h> +#include <vnet/ip/ip.h> + +#include <vnet/ipsec/ipsec.h> +#include <dpdk/ipsec/ipsec.h> +#include <dpdk/ipsec/esp.h> +#include <dpdk/device/dpdk.h> +#include <dpdk/device/dpdk_priv.h> + +#define foreach_esp_encrypt_next \ +_(DROP, "error-drop") \ +_(IP4_LOOKUP, "ip4-lookup") \ +_(IP6_LOOKUP, "ip6-lookup") \ +_(INTERFACE_OUTPUT, "interface-output") + +#define _(v, s) ESP_ENCRYPT_NEXT_##v, +typedef enum +{ + foreach_esp_encrypt_next +#undef _ + ESP_ENCRYPT_N_NEXT, +} esp_encrypt_next_t; + +#define foreach_esp_encrypt_error \ + _(RX_PKTS, "ESP pkts received") \ + _(SEQ_CYCLED, "sequence number cycled") \ + _(ENQ_FAIL, "Enqueue failed (buffer full)") \ + _(NO_CRYPTODEV, "Cryptodev not configured") + + +typedef enum +{ +#define _(sym,str) ESP_ENCRYPT_ERROR_##sym, + foreach_esp_encrypt_error +#undef _ + ESP_ENCRYPT_N_ERROR, +} esp_encrypt_error_t; + +static char *esp_encrypt_error_strings[] = { +#define _(sym,string) string, + foreach_esp_encrypt_error +#undef _ +}; + +vlib_node_registration_t dpdk_esp_encrypt_node; + +typedef struct +{ + u32 spi; + u32 seq; + ipsec_crypto_alg_t crypto_alg; + ipsec_integ_alg_t integ_alg; +} esp_encrypt_trace_t; + +/* packet trace format function */ +static u8 * +format_esp_encrypt_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + esp_encrypt_trace_t *t = va_arg (*args, esp_encrypt_trace_t *); + + s = format (s, "esp: spi %u seq %u crypto %U integrity %U", + t->spi, t->seq, + format_ipsec_crypto_alg, t->crypto_alg, + format_ipsec_integ_alg, t->integ_alg); + return s; +} + +static uword +dpdk_esp_encrypt_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + u32 n_left_from, *from, *to_next, next_index; + ipsec_main_t *im = &ipsec_main; + u32 thread_index = vlib_get_thread_index (); + dpdk_crypto_main_t *dcm = &dpdk_crypto_main; + dpdk_esp_main_t *em = &dpdk_esp_main; + u32 i; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + crypto_worker_main_t *cwm = + vec_elt_at_index (dcm->workers_main, thread_index); + u32 n_qps = vec_len (cwm->qp_data); + struct rte_crypto_op **cops_to_enq[n_qps]; + u32 n_cop_qp[n_qps], *bi_to_enq[n_qps]; + + for (i = 0; i < n_qps; i++) + { + bi_to_enq[i] = cwm->qp_data[i].bi; + cops_to_enq[i] = cwm->qp_data[i].cops; + } + + memset (n_cop_qp, 0, n_qps * sizeof (u32)); + + crypto_alloc_cops (); + + next_index = ESP_ENCRYPT_NEXT_DROP; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0, next0; + vlib_buffer_t *b0 = 0; + u32 sa_index0; + ipsec_sa_t *sa0; + ip4_and_esp_header_t *ih0, *oh0 = 0; + ip6_and_esp_header_t *ih6_0, *oh6_0 = 0; + struct rte_mbuf *mb0 = 0; + esp_footer_t *f0; + u8 is_ipv6; + u8 ip_hdr_size; + u8 next_hdr_type; + u8 transport_mode = 0; + const int BLOCK_SIZE = 16; + u32 iv_size; + u16 orig_sz; + u8 trunc_size; + crypto_sa_session_t *sa_sess; + void *sess; + struct rte_crypto_op *cop = 0; + u16 qp_index; + + bi0 = from[0]; + from += 1; + n_left_from -= 1; + + b0 = vlib_get_buffer (vm, bi0); + sa_index0 = vnet_buffer (b0)->ipsec.sad_index; + sa0 = pool_elt_at_index (im->sad, sa_index0); + + if (PREDICT_FALSE (esp_seq_advance (sa0))) + { + clib_warning ("sequence number counter has cycled SPI %u", + sa0->spi); + 
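	      /* Annotation: a cycled 32-bit ESP sequence number means the SA
	       * must be rekeyed (RFC 4303); until the rekey TODO below is
	       * implemented, the packet is counted and sent to the drop
	       * next index. */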
vlib_node_increment_counter (vm, dpdk_esp_encrypt_node.index, + ESP_ENCRYPT_ERROR_SEQ_CYCLED, 1); + //TODO: rekey SA + to_next[0] = bi0; + to_next += 1; + n_left_to_next -= 1; + goto trace; + } + + sa0->total_data_size += b0->current_length; + + sa_sess = pool_elt_at_index (cwm->sa_sess_d[1], sa_index0); + if (PREDICT_FALSE (!sa_sess->sess)) + { + int ret = create_sym_sess (sa0, sa_sess, 1); + + if (PREDICT_FALSE (ret)) + { + to_next[0] = bi0; + to_next += 1; + n_left_to_next -= 1; + goto trace; + } + } + + qp_index = sa_sess->qp_index; + sess = sa_sess->sess; + + ASSERT (vec_len (vec_elt (cwm->qp_data, qp_index).free_cops) > 0); + cop = vec_pop (vec_elt (cwm->qp_data, qp_index).free_cops); + ASSERT (cop->status == RTE_CRYPTO_OP_STATUS_NOT_PROCESSED); + + cops_to_enq[qp_index][0] = cop; + cops_to_enq[qp_index] += 1; + n_cop_qp[qp_index] += 1; + bi_to_enq[qp_index][0] = bi0; + bi_to_enq[qp_index] += 1; + + ssize_t adv; + iv_size = em->esp_crypto_algs[sa0->crypto_alg].iv_len; + if (sa0->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128) + trunc_size = 16; + else + trunc_size = em->esp_integ_algs[sa0->integ_alg].trunc_size; + + ih0 = vlib_buffer_get_current (b0); + orig_sz = b0->current_length; + is_ipv6 = (ih0->ip4.ip_version_and_header_length & 0xF0) == 0x60; + /* is ipv6 */ + if (PREDICT_TRUE (sa0->is_tunnel)) + { + if (PREDICT_TRUE (!is_ipv6)) + adv = -sizeof (ip4_and_esp_header_t); + else + adv = -sizeof (ip6_and_esp_header_t); + } + else + { + adv = -sizeof (esp_header_t); + if (PREDICT_TRUE (!is_ipv6)) + orig_sz -= sizeof (ip4_header_t); + else + orig_sz -= sizeof (ip6_header_t); + } + + /*transport mode save the eth header before it is overwritten */ + if (PREDICT_FALSE (!sa0->is_tunnel)) + { + ethernet_header_t *ieh0 = (ethernet_header_t *) + ((u8 *) vlib_buffer_get_current (b0) - + sizeof (ethernet_header_t)); + ethernet_header_t *oeh0 = + (ethernet_header_t *) ((u8 *) ieh0 + (adv - iv_size)); + clib_memcpy (oeh0, ieh0, sizeof (ethernet_header_t)); + } + + vlib_buffer_advance (b0, adv - iv_size); + + /* XXX IP6/ip4 and IP4/IP6 not supported, only IP4/IP4 and IP6/IP6 */ + + /* is ipv6 */ + if (PREDICT_FALSE (is_ipv6)) + { + ih6_0 = (ip6_and_esp_header_t *) ih0; + ip_hdr_size = sizeof (ip6_header_t); + oh6_0 = vlib_buffer_get_current (b0); + + if (PREDICT_TRUE (sa0->is_tunnel)) + { + next_hdr_type = IP_PROTOCOL_IPV6; + oh6_0->ip6.ip_version_traffic_class_and_flow_label = + ih6_0->ip6.ip_version_traffic_class_and_flow_label; + } + else + { + next_hdr_type = ih6_0->ip6.protocol; + memmove (oh6_0, ih6_0, sizeof (ip6_header_t)); + } + + oh6_0->ip6.protocol = IP_PROTOCOL_IPSEC_ESP; + oh6_0->ip6.hop_limit = 254; + oh6_0->esp.spi = clib_net_to_host_u32 (sa0->spi); + oh6_0->esp.seq = clib_net_to_host_u32 (sa0->seq); + } + else + { + ip_hdr_size = sizeof (ip4_header_t); + oh0 = vlib_buffer_get_current (b0); + + if (PREDICT_TRUE (sa0->is_tunnel)) + { + next_hdr_type = IP_PROTOCOL_IP_IN_IP; + oh0->ip4.tos = ih0->ip4.tos; + } + else + { + next_hdr_type = ih0->ip4.protocol; + memmove (oh0, ih0, sizeof (ip4_header_t)); + } + + oh0->ip4.ip_version_and_header_length = 0x45; + oh0->ip4.fragment_id = 0; + oh0->ip4.flags_and_fragment_offset = 0; + oh0->ip4.ttl = 254; + oh0->ip4.protocol = IP_PROTOCOL_IPSEC_ESP; + oh0->esp.spi = clib_net_to_host_u32 (sa0->spi); + oh0->esp.seq = clib_net_to_host_u32 (sa0->seq); + } + + if (PREDICT_TRUE + (!is_ipv6 && sa0->is_tunnel && !sa0->is_tunnel_ip6)) + { + oh0->ip4.src_address.as_u32 = sa0->tunnel_src_addr.ip4.as_u32; + oh0->ip4.dst_address.as_u32 = 
sa0->tunnel_dst_addr.ip4.as_u32; + + /* in tunnel mode send it back to FIB */ + next0 = ESP_ENCRYPT_NEXT_IP4_LOOKUP; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + } + else if (is_ipv6 && sa0->is_tunnel && sa0->is_tunnel_ip6) + { + oh6_0->ip6.src_address.as_u64[0] = + sa0->tunnel_src_addr.ip6.as_u64[0]; + oh6_0->ip6.src_address.as_u64[1] = + sa0->tunnel_src_addr.ip6.as_u64[1]; + oh6_0->ip6.dst_address.as_u64[0] = + sa0->tunnel_dst_addr.ip6.as_u64[0]; + oh6_0->ip6.dst_address.as_u64[1] = + sa0->tunnel_dst_addr.ip6.as_u64[1]; + + /* in tunnel mode send it back to FIB */ + next0 = ESP_ENCRYPT_NEXT_IP6_LOOKUP; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + } + else + { + next0 = ESP_ENCRYPT_NEXT_INTERFACE_OUTPUT; + transport_mode = 1; + } + + int blocks = 1 + (orig_sz + 1) / BLOCK_SIZE; + + /* pad packet in input buffer */ + u8 pad_bytes = BLOCK_SIZE * blocks - 2 - orig_sz; + u8 i; + u8 *padding = vlib_buffer_get_current (b0) + b0->current_length; + + for (i = 0; i < pad_bytes; ++i) + padding[i] = i + 1; + + f0 = vlib_buffer_get_current (b0) + b0->current_length + pad_bytes; + f0->pad_length = pad_bytes; + f0->next_header = next_hdr_type; + b0->current_length += pad_bytes + 2 + trunc_size; + + vnet_buffer (b0)->sw_if_index[VLIB_RX] = + vnet_buffer (b0)->sw_if_index[VLIB_RX]; + b0->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; + + struct rte_crypto_sym_op *sym_cop; + sym_cop = (struct rte_crypto_sym_op *) (cop + 1); + + dpdk_cop_priv_t *priv = (dpdk_cop_priv_t *) (sym_cop + 1); + + vnet_buffer (b0)->unused[0] = next0; + + mb0 = rte_mbuf_from_vlib_buffer (b0); + mb0->data_len = b0->current_length; + mb0->pkt_len = b0->current_length; + mb0->data_off = RTE_PKTMBUF_HEADROOM + b0->current_data; + + dpdk_gcm_cnt_blk *icb = &priv->cb; + + crypto_set_icb (icb, sa0->salt, sa0->seq, sa0->seq_hi); + + u8 is_aead = sa0->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128; + u32 cipher_off, cipher_len; + u32 auth_off = 0, auth_len = 0, aad_size = 0; + u8 *aad = NULL, *digest = NULL; + u64 digest_paddr; + + digest = + vlib_buffer_get_current (b0) + b0->current_length - trunc_size; + + digest_paddr = mb0->buf_physaddr + (digest - (u8 *) mb0->buf_addr); + + if (is_aead) + { + u32 *esp_iv = + (u32 *) (b0->data + b0->current_data + ip_hdr_size + + sizeof (esp_header_t)); + esp_iv[0] = sa0->seq; + esp_iv[1] = sa0->seq_hi; + + cipher_off = ip_hdr_size + sizeof (esp_header_t) + iv_size; + cipher_len = BLOCK_SIZE * blocks; + iv_size = 16; /* GCM IV size, not ESP IV size */ + + aad = priv->aad; + clib_memcpy (aad, vlib_buffer_get_current (b0) + ip_hdr_size, + 8); + aad_size = 8; + if (PREDICT_FALSE (sa0->use_esn)) + { + *((u32 *) & aad[8]) = sa0->seq_hi; + aad_size = 12; + } + } + else + { + cipher_off = ip_hdr_size + sizeof (esp_header_t); + cipher_len = BLOCK_SIZE * blocks + iv_size; + + auth_off = ip_hdr_size; + auth_len = b0->current_length - ip_hdr_size - trunc_size; + + if (PREDICT_FALSE (sa0->use_esn)) + { + *((u32 *) digest) = sa0->seq_hi; + auth_len += sizeof (sa0->seq_hi); + } + } + + crypto_op_setup (is_aead, mb0, cop, sess, + cipher_off, cipher_len, (u8 *) icb, iv_size, + auth_off, auth_len, aad, aad_size, + digest, digest_paddr, trunc_size); + + if (PREDICT_FALSE (is_ipv6)) + { + oh6_0->ip6.payload_length = + clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0) - + sizeof (ip6_header_t)); + } + else + { + oh0->ip4.length = + clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0)); + oh0->ip4.checksum = ip4_header_checksum (&oh0->ip4); + } + + if (transport_mode) + vlib_buffer_advance (b0, 
-sizeof (ethernet_header_t)); + + trace: + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + esp_encrypt_trace_t *tr = + vlib_add_trace (vm, node, b0, sizeof (*tr)); + tr->spi = sa0->spi; + tr->seq = sa0->seq - 1; + tr->crypto_alg = sa0->crypto_alg; + tr->integ_alg = sa0->integ_alg; + } + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + vlib_node_increment_counter (vm, dpdk_esp_encrypt_node.index, + ESP_ENCRYPT_ERROR_RX_PKTS, + from_frame->n_vectors); + crypto_qp_data_t *qpd; + /* *INDENT-OFF* */ + vec_foreach_index (i, cwm->qp_data) + { + u32 enq; + + if (!n_cop_qp[i]) + continue; + + qpd = vec_elt_at_index(cwm->qp_data, i); + enq = rte_cryptodev_enqueue_burst(qpd->dev_id, qpd->qp_id, + qpd->cops, n_cop_qp[i]); + qpd->inflights += enq; + + if (PREDICT_FALSE(enq < n_cop_qp[i])) + { + crypto_free_cop (qpd, &qpd->cops[enq], n_cop_qp[i] - enq); + vlib_buffer_free (vm, &qpd->bi[enq], n_cop_qp[i] - enq); + + vlib_node_increment_counter (vm, dpdk_esp_encrypt_node.index, + ESP_ENCRYPT_ERROR_ENQ_FAIL, + n_cop_qp[i] - enq); + } + } + /* *INDENT-ON* */ + + return from_frame->n_vectors; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (dpdk_esp_encrypt_node) = { + .function = dpdk_esp_encrypt_node_fn, + .name = "dpdk-esp-encrypt", + .flags = VLIB_NODE_FLAG_IS_OUTPUT, + .vector_size = sizeof (u32), + .format_trace = format_esp_encrypt_trace, + .n_errors = ARRAY_LEN (esp_encrypt_error_strings), + .error_strings = esp_encrypt_error_strings, + .n_next_nodes = 1, + .next_nodes = + { + [ESP_ENCRYPT_NEXT_DROP] = "error-drop", + } +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (dpdk_esp_encrypt_node, dpdk_esp_encrypt_node_fn) +/* + * ESP Encrypt Post Node + */ +#define foreach_esp_encrypt_post_error \ + _(PKTS, "ESP post pkts") + typedef enum + { +#define _(sym,str) ESP_ENCRYPT_POST_ERROR_##sym, + foreach_esp_encrypt_post_error +#undef _ + ESP_ENCRYPT_POST_N_ERROR, + } esp_encrypt_post_error_t; + + static char *esp_encrypt_post_error_strings[] = { +#define _(sym,string) string, + foreach_esp_encrypt_post_error +#undef _ + }; + +vlib_node_registration_t dpdk_esp_encrypt_post_node; + +static u8 * +format_esp_encrypt_post_trace (u8 * s, va_list * args) +{ + return s; +} + +static uword +dpdk_esp_encrypt_post_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + u32 n_left_from, *from, *to_next = 0, next_index; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0, next0; + vlib_buffer_t *b0 = 0; + + bi0 = from[0]; + from += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + to_next[0] = bi0; + to_next += 1; + + next0 = vnet_buffer (b0)->unused[0]; + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, bi0, + next0); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, dpdk_esp_encrypt_post_node.index, + ESP_ENCRYPT_POST_ERROR_PKTS, + from_frame->n_vectors); + + return from_frame->n_vectors; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (dpdk_esp_encrypt_post_node) = { + .function = dpdk_esp_encrypt_post_node_fn, + .name = "dpdk-esp-encrypt-post", + .vector_size = sizeof (u32), + .format_trace = format_esp_encrypt_post_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + 
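  /* Annotation: this post node runs after dpdk-crypto-input dequeues the
   * completed operation; it forwards each buffer to the next index that
   * dpdk-esp-encrypt stashed in vnet_buffer (b0)->unused[0]. */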
.n_errors = ARRAY_LEN (esp_encrypt_post_error_strings), + .error_strings = esp_encrypt_post_error_strings, + .n_next_nodes = ESP_ENCRYPT_N_NEXT, + .next_nodes = + { +#define _(s,n) [ESP_ENCRYPT_NEXT_##s] = n, + foreach_esp_encrypt_next +#undef _ + } +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (dpdk_esp_encrypt_post_node, + dpdk_esp_encrypt_post_node_fn) +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/dpdk/ipsec/ipsec.c b/src/plugins/dpdk/ipsec/ipsec.c new file mode 100644 index 00000000..7783171f --- /dev/null +++ b/src/plugins/dpdk/ipsec/ipsec.c @@ -0,0 +1,501 @@ +/* + * Copyright (c) 2016 Intel and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <vnet/vnet.h> +#include <vnet/ip/ip.h> +#include <vnet/api_errno.h> +#include <vnet/ipsec/ipsec.h> +#include <vlib/node_funcs.h> + +#include <dpdk/device/dpdk.h> +#include <dpdk/ipsec/ipsec.h> +#include <dpdk/ipsec/esp.h> + +#define DPDK_CRYPTO_NB_SESS_OBJS 20000 +#define DPDK_CRYPTO_CACHE_SIZE 512 +#define DPDK_CRYPTO_PRIV_SIZE 128 +#define DPDK_CRYPTO_N_QUEUE_DESC 1024 +#define DPDK_CRYPTO_NB_COPS (1024 * 4) + +static int +add_del_sa_sess (u32 sa_index, u8 is_add) +{ + dpdk_crypto_main_t *dcm = &dpdk_crypto_main; + crypto_worker_main_t *cwm; + u8 skip_master = vlib_num_workers () > 0; + + /* *INDENT-OFF* */ + vec_foreach (cwm, dcm->workers_main) + { + crypto_sa_session_t *sa_sess; + u8 is_outbound; + + if (skip_master) + { + skip_master = 0; + continue; + } + + for (is_outbound = 0; is_outbound < 2; is_outbound++) + { + if (is_add) + { + pool_get (cwm->sa_sess_d[is_outbound], sa_sess); + } + else + { + u8 dev_id; + i32 ret; + + sa_sess = pool_elt_at_index (cwm->sa_sess_d[is_outbound], sa_index); + dev_id = cwm->qp_data[sa_sess->qp_index].dev_id; + + if (!sa_sess->sess) + continue; +#if DPDK_NO_AEAD + ret = (rte_cryptodev_sym_session_free(dev_id, sa_sess->sess) == NULL); + ASSERT (ret); +#else + ret = rte_cryptodev_sym_session_clear(dev_id, sa_sess->sess); + ASSERT (!ret); + + ret = rte_cryptodev_sym_session_free(sa_sess->sess); + ASSERT (!ret); +#endif + memset(sa_sess, 0, sizeof(sa_sess[0])); + } + } + } + /* *INDENT-OFF* */ + + return 0; +} + +static void +update_qp_data (crypto_worker_main_t * cwm, + u8 cdev_id, u16 qp_id, u8 is_outbound, u16 * idx) +{ + crypto_qp_data_t *qpd; + + /* *INDENT-OFF* */ + vec_foreach_index (*idx, cwm->qp_data) + { + qpd = vec_elt_at_index(cwm->qp_data, *idx); + + if (qpd->dev_id == cdev_id && qpd->qp_id == qp_id && + qpd->is_outbound == is_outbound) + return; + } + /* *INDENT-ON* */ + + vec_add2_aligned (cwm->qp_data, qpd, 1, CLIB_CACHE_LINE_BYTES); + + qpd->dev_id = cdev_id; + qpd->qp_id = qp_id; + qpd->is_outbound = is_outbound; +} + +/* + * return: + * 0: already exist + * 1: mapped + */ +static int +add_mapping (crypto_worker_main_t * cwm, + u8 cdev_id, u16 qp, u8 is_outbound, + const struct rte_cryptodev_capabilities *cipher_cap, + const struct 
rte_cryptodev_capabilities *auth_cap) +{ + u16 qp_index; + uword key = 0, data, *ret; + crypto_worker_qp_key_t *p_key = (crypto_worker_qp_key_t *) & key; + + p_key->cipher_algo = (u8) cipher_cap->sym.cipher.algo; + p_key->auth_algo = (u8) auth_cap->sym.auth.algo; + p_key->is_outbound = is_outbound; +#if ! DPDK_NO_AEAD + p_key->is_aead = cipher_cap->sym.xform_type == RTE_CRYPTO_SYM_XFORM_AEAD; +#endif + + ret = hash_get (cwm->algo_qp_map, key); + if (ret) + return 0; + + update_qp_data (cwm, cdev_id, qp, is_outbound, &qp_index); + + data = (uword) qp_index; + hash_set (cwm->algo_qp_map, key, data); + + return 1; +} + +/* + * return: + * 0: already exist + * 1: mapped + */ +static int +add_cdev_mapping (crypto_worker_main_t * cwm, + struct rte_cryptodev_info *dev_info, u8 cdev_id, + u16 qp, u8 is_outbound) +{ + const struct rte_cryptodev_capabilities *i, *j; + u32 mapped = 0; + + for (i = dev_info->capabilities; i->op != RTE_CRYPTO_OP_TYPE_UNDEFINED; i++) + { +#if ! DPDK_NO_AEAD + if (i->sym.xform_type == RTE_CRYPTO_SYM_XFORM_AEAD) + { + struct rte_cryptodev_capabilities none = { 0 }; + + if (check_algo_is_supported (i, NULL) != 0) + continue; + + none.sym.auth.algo = RTE_CRYPTO_AUTH_NULL; + + mapped |= add_mapping (cwm, cdev_id, qp, is_outbound, i, &none); + continue; + } +#endif + if (i->sym.xform_type != RTE_CRYPTO_SYM_XFORM_CIPHER) + continue; + + if (check_algo_is_supported (i, NULL) != 0) + continue; + + for (j = dev_info->capabilities; j->op != RTE_CRYPTO_OP_TYPE_UNDEFINED; + j++) + { + if (j->sym.xform_type != RTE_CRYPTO_SYM_XFORM_AUTH) + continue; + + if (check_algo_is_supported (j, NULL) != 0) + continue; + + mapped |= add_mapping (cwm, cdev_id, qp, is_outbound, i, j); + } + } + + return mapped; +} + +static int +check_cryptodev_queues () +{ + u32 n_qs = 0; + u8 cdev_id; + u32 n_req_qs = 2; + + if (vlib_num_workers () > 0) + n_req_qs = vlib_num_workers () * 2; + + for (cdev_id = 0; cdev_id < rte_cryptodev_count (); cdev_id++) + { + struct rte_cryptodev_info cdev_info; + + rte_cryptodev_info_get (cdev_id, &cdev_info); + + if (! + (cdev_info.feature_flags & RTE_CRYPTODEV_FF_SYM_OPERATION_CHAINING)) + continue; + + n_qs += cdev_info.max_nb_queue_pairs; + } + + if (n_qs >= n_req_qs) + return 0; + else + return -1; +} + +static clib_error_t * +dpdk_ipsec_check_support (ipsec_sa_t * sa) +{ + if (sa->crypto_alg == IPSEC_CRYPTO_ALG_AES_GCM_128) + { + if (sa->integ_alg != IPSEC_INTEG_ALG_NONE) + return clib_error_return (0, "unsupported integ-alg %U with " + "crypto-alg aes-gcm-128", + format_ipsec_integ_alg, sa->integ_alg); +#if DPDK_NO_AEAD + sa->integ_alg = IPSEC_INTEG_ALG_AES_GCM_128; +#endif + } +#if DPDK_NO_AEAD + else if (sa->crypto_alg == IPSEC_CRYPTO_ALG_NONE || + sa->integ_alg == IPSEC_INTEG_ALG_NONE || + sa->integ_alg == IPSEC_INTEG_ALG_AES_GCM_128) +#else + else if (sa->integ_alg == IPSEC_INTEG_ALG_NONE) +#endif + return clib_error_return (0, + "unsupported integ-alg %U with crypto-alg %U", + format_ipsec_integ_alg, sa->integ_alg, + format_ipsec_crypto_alg, sa->crypto_alg); + + return 0; +} + +static uword +dpdk_ipsec_process (vlib_main_t * vm, vlib_node_runtime_t * rt, + vlib_frame_t * f) +{ + ipsec_main_t *im = &ipsec_main; + dpdk_crypto_main_t *dcm = &dpdk_crypto_main; + vlib_thread_main_t *tm = vlib_get_thread_main (); + struct rte_cryptodev_config dev_conf; + struct rte_cryptodev_qp_conf qp_conf; + struct rte_cryptodev_info cdev_info; + struct rte_mempool *rmp; + i32 dev_id, ret; + u32 i, skip_master; +#if ! 
DPDK_NO_AEAD + u32 max_sess_size = 0, sess_size; + i8 socket_id; +#endif + + if (check_cryptodev_queues () < 0) + { + clib_warning ("not enough Cryptodevs, default to OpenSSL IPsec"); + return 0; + } + dcm->enabled = 1; + + vec_alloc (dcm->workers_main, tm->n_vlib_mains); + _vec_len (dcm->workers_main) = tm->n_vlib_mains; + + skip_master = vlib_num_workers () > 0; + + fprintf (stdout, "DPDK Cryptodevs info:\n"); + fprintf (stdout, "dev_id\tn_qp\tnb_obj\tcache_size\n"); + /* HW cryptodevs have higher dev_id, use HW first */ + for (dev_id = rte_cryptodev_count () - 1; dev_id >= 0; dev_id--) + { + u16 max_nb_qp, qp = 0; + + rte_cryptodev_info_get (dev_id, &cdev_info); + + if (! + (cdev_info.feature_flags & RTE_CRYPTODEV_FF_SYM_OPERATION_CHAINING)) + continue; + + max_nb_qp = cdev_info.max_nb_queue_pairs; + + for (i = skip_master; i < tm->n_vlib_mains; i++) + { + u8 is_outbound; + crypto_worker_main_t *cwm; + uword *map; + + cwm = vec_elt_at_index (dcm->workers_main, i); + map = cwm->algo_qp_map; + + if (!map) + { + map = hash_create (0, sizeof (crypto_worker_qp_key_t)); + if (!map) + { + clib_warning ("unable to create hash table for worker %u", + vlib_mains[i]->thread_index); + goto error; + } + cwm->algo_qp_map = map; + } + + for (is_outbound = 0; is_outbound < 2 && qp < max_nb_qp; + is_outbound++) + qp += add_cdev_mapping (cwm, &cdev_info, dev_id, qp, is_outbound); + } + + if (qp == 0) + continue; + + dev_conf.socket_id = rte_cryptodev_socket_id (dev_id); + dev_conf.nb_queue_pairs = cdev_info.max_nb_queue_pairs; +#if DPDK_NO_AEAD + dev_conf.session_mp.nb_objs = DPDK_CRYPTO_NB_SESS_OBJS; + dev_conf.session_mp.cache_size = DPDK_CRYPTO_CACHE_SIZE; +#endif + ret = rte_cryptodev_configure (dev_id, &dev_conf); + if (ret < 0) + { + clib_warning ("cryptodev %u config error", dev_id); + goto error; + } + + qp_conf.nb_descriptors = DPDK_CRYPTO_N_QUEUE_DESC; + for (qp = 0; qp < dev_conf.nb_queue_pairs; qp++) + { +#if DPDK_NO_AEAD + ret = rte_cryptodev_queue_pair_setup (dev_id, qp, &qp_conf, + dev_conf.socket_id); +#else + ret = rte_cryptodev_queue_pair_setup (dev_id, qp, &qp_conf, + dev_conf.socket_id, NULL); +#endif + if (ret < 0) + { + clib_warning ("cryptodev %u qp %u setup error", dev_id, qp); + goto error; + } + } + vec_validate (dcm->cop_pools, dev_conf.socket_id); + +#if ! DPDK_NO_AEAD + sess_size = rte_cryptodev_get_private_session_size (dev_id); + if (sess_size > max_sess_size) + max_sess_size = sess_size; +#endif + + if (!vec_elt (dcm->cop_pools, dev_conf.socket_id)) + { + u8 *pool_name = format (0, "crypto_op_pool_socket%u%c", + dev_conf.socket_id, 0); + + rmp = rte_crypto_op_pool_create ((char *) pool_name, + RTE_CRYPTO_OP_TYPE_SYMMETRIC, + DPDK_CRYPTO_NB_COPS * + (1 + vlib_num_workers ()), + DPDK_CRYPTO_CACHE_SIZE, + DPDK_CRYPTO_PRIV_SIZE, + dev_conf.socket_id); + + if (!rmp) + { + clib_warning ("failed to allocate %s", pool_name); + vec_free (pool_name); + goto error; + } + vec_free (pool_name); + vec_elt (dcm->cop_pools, dev_conf.socket_id) = rmp; + } + + fprintf (stdout, "%u\t%u\t%u\t%u\n", dev_id, dev_conf.nb_queue_pairs, + DPDK_CRYPTO_NB_SESS_OBJS, DPDK_CRYPTO_CACHE_SIZE); + } + +#if ! 
DPDK_NO_AEAD + /* *INDENT-OFF* */ + vec_foreach_index (socket_id, dcm->cop_pools) + { + u8 *pool_name; + + if (!vec_elt (dcm->cop_pools, socket_id)) + continue; + + vec_validate (dcm->sess_h_pools, socket_id); + pool_name = format (0, "crypto_sess_h_socket%u%c", + socket_id, 0); + rmp = + rte_mempool_create((i8 *)pool_name, DPDK_CRYPTO_NB_SESS_OBJS, + rte_cryptodev_get_header_session_size (), + 512, 0, NULL, NULL, NULL, NULL, + socket_id, 0); + if (!rmp) + { + clib_warning ("failed to allocate %s", pool_name); + vec_free (pool_name); + goto error; + } + vec_free (pool_name); + vec_elt (dcm->sess_h_pools, socket_id) = rmp; + + vec_validate (dcm->sess_pools, socket_id); + pool_name = format (0, "crypto_sess_socket%u%c", + socket_id, 0); + rmp = + rte_mempool_create((i8 *)pool_name, DPDK_CRYPTO_NB_SESS_OBJS, + max_sess_size, 512, 0, NULL, NULL, NULL, NULL, + socket_id, 0); + if (!rmp) + { + clib_warning ("failed to allocate %s", pool_name); + vec_free (pool_name); + goto error; + } + vec_free (pool_name); + vec_elt (dcm->sess_pools, socket_id) = rmp; + } + /* *INDENT-ON* */ +#endif + + dpdk_esp_init (); + + /* Add new next node and set as default */ + vlib_node_t *node, *next_node; + + next_node = vlib_get_node_by_name (vm, (u8 *) "dpdk-esp-encrypt"); + ASSERT (next_node); + node = vlib_get_node_by_name (vm, (u8 *) "ipsec-output-ip4"); + ASSERT (node); + im->esp_encrypt_node_index = next_node->index; + im->esp_encrypt_next_index = + vlib_node_add_next (vm, node->index, next_node->index); + + next_node = vlib_get_node_by_name (vm, (u8 *) "dpdk-esp-decrypt"); + ASSERT (next_node); + node = vlib_get_node_by_name (vm, (u8 *) "ipsec-input-ip4"); + ASSERT (node); + im->esp_decrypt_node_index = next_node->index; + im->esp_decrypt_next_index = + vlib_node_add_next (vm, node->index, next_node->index); + + im->cb.check_support_cb = dpdk_ipsec_check_support; + im->cb.add_del_sa_sess_cb = add_del_sa_sess; + + for (i = skip_master; i < tm->n_vlib_mains; i++) + vlib_node_set_state (vlib_mains[i], dpdk_crypto_input_node.index, + VLIB_NODE_STATE_POLLING); + + /* TODO cryptodev counters */ + + return 0; + +error: + ; + crypto_worker_main_t *cwm; + struct rte_mempool **mp; + /* *INDENT-OFF* */ + vec_foreach (cwm, dcm->workers_main) + hash_free (cwm->algo_qp_map); + + vec_foreach (mp, dcm->cop_pools) + { + if (mp) + rte_mempool_free (mp[0]); + } + /* *INDENT-ON* */ + vec_free (dcm->workers_main); + vec_free (dcm->cop_pools); + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (dpdk_ipsec_process_node,static) = { + .function = dpdk_ipsec_process, + .type = VLIB_NODE_TYPE_PROCESS, + .name = "dpdk-ipsec-process", + .process_log2_n_stack_bytes = 17, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/dpdk/ipsec/ipsec.h b/src/plugins/dpdk/ipsec/ipsec.h new file mode 100644 index 00000000..a94dd682 --- /dev/null +++ b/src/plugins/dpdk/ipsec/ipsec.h @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2016 Intel and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __DPDK_IPSEC_H__ +#define __DPDK_IPSEC_H__ + +#include <vnet/vnet.h> + +#undef always_inline +#include <rte_config.h> +#include <rte_crypto.h> +#include <rte_cryptodev.h> + +#if CLIB_DEBUG > 0 +#define always_inline static inline +#else +#define always_inline static inline __attribute__ ((__always_inline__)) +#endif + + +#define MAX_QP_PER_LCORE 16 + +typedef struct +{ + u32 salt; + u32 iv[2]; + u32 cnt; +} dpdk_gcm_cnt_blk; + +typedef struct +{ + dpdk_gcm_cnt_blk cb; + union + { + u8 aad[12]; + u8 icv[64]; + }; +} dpdk_cop_priv_t; + +typedef struct +{ + u8 cipher_algo; + u8 auth_algo; + u8 is_outbound; + u8 is_aead; +} crypto_worker_qp_key_t; + +typedef struct +{ + u16 dev_id; + u16 qp_id; + u16 is_outbound; + i16 inflights; + u32 bi[VLIB_FRAME_SIZE]; + struct rte_crypto_op *cops[VLIB_FRAME_SIZE]; + struct rte_crypto_op **free_cops; +} crypto_qp_data_t; + +typedef struct +{ + u8 qp_index; + void *sess; +} crypto_sa_session_t; + +typedef struct +{ + crypto_sa_session_t *sa_sess_d[2]; + crypto_qp_data_t *qp_data; + uword *algo_qp_map; +} crypto_worker_main_t; + +typedef struct +{ + struct rte_mempool **sess_h_pools; + struct rte_mempool **sess_pools; + struct rte_mempool **cop_pools; + crypto_worker_main_t *workers_main; + u8 enabled; +} dpdk_crypto_main_t; + +dpdk_crypto_main_t dpdk_crypto_main; + +extern vlib_node_registration_t dpdk_crypto_input_node; + +#define CRYPTO_N_FREE_COPS (VLIB_FRAME_SIZE * 3) + +static_always_inline void +crypto_alloc_cops () +{ + dpdk_crypto_main_t *dcm = &dpdk_crypto_main; + u32 thread_index = vlib_get_thread_index (); + crypto_worker_main_t *cwm = &dcm->workers_main[thread_index]; + unsigned socket_id = rte_socket_id (); + crypto_qp_data_t *qpd; + + /* *INDENT-OFF* */ + vec_foreach (qpd, cwm->qp_data) + { + u32 l = vec_len (qpd->free_cops); + + if (PREDICT_FALSE (l < VLIB_FRAME_SIZE)) + { + u32 n_alloc; + + if (PREDICT_FALSE (!qpd->free_cops)) + vec_alloc (qpd->free_cops, CRYPTO_N_FREE_COPS); + + n_alloc = rte_crypto_op_bulk_alloc (dcm->cop_pools[socket_id], + RTE_CRYPTO_OP_TYPE_SYMMETRIC, + &qpd->free_cops[l], + CRYPTO_N_FREE_COPS - l - 1); + + _vec_len (qpd->free_cops) = l + n_alloc; + } + } + /* *INDENT-ON* */ +} + +static_always_inline void +crypto_free_cop (crypto_qp_data_t * qpd, struct rte_crypto_op **cops, u32 n) +{ + u32 l = vec_len (qpd->free_cops); + + if (l + n >= CRYPTO_N_FREE_COPS) + { + l -= VLIB_FRAME_SIZE; + rte_mempool_put_bulk (cops[0]->mempool, + (void **) &qpd->free_cops[l], VLIB_FRAME_SIZE); + } + clib_memcpy (&qpd->free_cops[l], cops, sizeof (*cops) * n); + + _vec_len (qpd->free_cops) = l + n; +} + +static_always_inline int +check_algo_is_supported (const struct rte_cryptodev_capabilities *cap, + char *name) +{ + struct + { + enum rte_crypto_sym_xform_type type; + union + { + enum rte_crypto_auth_algorithm auth; + enum rte_crypto_cipher_algorithm cipher; +#if ! 
DPDK_NO_AEAD + enum rte_crypto_aead_algorithm aead; +#endif + }; + char *name; + } supported_algo[] = + { + { + .type = RTE_CRYPTO_SYM_XFORM_CIPHER,.cipher = + RTE_CRYPTO_CIPHER_NULL,.name = "NULL"}, + { + .type = RTE_CRYPTO_SYM_XFORM_CIPHER,.cipher = + RTE_CRYPTO_CIPHER_AES_CBC,.name = "AES_CBC"}, +#if DPDK_NO_AEAD + { + .type = RTE_CRYPTO_SYM_XFORM_CIPHER,.cipher = + RTE_CRYPTO_CIPHER_AES_GCM,.name = "AES-GCM"}, +#else + { + .type = RTE_CRYPTO_SYM_XFORM_AEAD,.aead = + RTE_CRYPTO_AEAD_AES_GCM,.name = "AES-GCM"}, +#endif + { + .type = RTE_CRYPTO_SYM_XFORM_AUTH,.auth = + RTE_CRYPTO_AUTH_NULL,.name = "NULL"}, + { + .type = RTE_CRYPTO_SYM_XFORM_AUTH,.auth = + RTE_CRYPTO_AUTH_SHA1_HMAC,.name = "HMAC-SHA1"}, + { + .type = RTE_CRYPTO_SYM_XFORM_AUTH,.auth = + RTE_CRYPTO_AUTH_SHA256_HMAC,.name = "HMAC-SHA256"}, + { + .type = RTE_CRYPTO_SYM_XFORM_AUTH,.auth = + RTE_CRYPTO_AUTH_SHA384_HMAC,.name = "HMAC-SHA384"}, + { + .type = RTE_CRYPTO_SYM_XFORM_AUTH,.auth = + RTE_CRYPTO_AUTH_SHA512_HMAC,.name = "HMAC-SHA512"}, +#if DPDK_NO_AEAD + { + .type = RTE_CRYPTO_SYM_XFORM_AUTH,.auth = + RTE_CRYPTO_AUTH_AES_GCM,.name = "AES-GCM"}, +#endif + { + /* tail */ + .type = RTE_CRYPTO_SYM_XFORM_NOT_SPECIFIED} + }; + + uint32_t i = 0; + + if (cap->op != RTE_CRYPTO_OP_TYPE_SYMMETRIC) + return -1; + + while (supported_algo[i].type != RTE_CRYPTO_SYM_XFORM_NOT_SPECIFIED) + { + if (cap->sym.xform_type == supported_algo[i].type) + { + if ((cap->sym.xform_type == RTE_CRYPTO_SYM_XFORM_CIPHER && + cap->sym.cipher.algo == supported_algo[i].cipher) || +#if ! DPDK_NO_AEAD + (cap->sym.xform_type == RTE_CRYPTO_SYM_XFORM_AEAD && + cap->sym.aead.algo == supported_algo[i].aead) || +#endif + (cap->sym.xform_type == RTE_CRYPTO_SYM_XFORM_AUTH && + cap->sym.auth.algo == supported_algo[i].auth)) + { + if (name) + strcpy (name, supported_algo[i].name); + return 0; + } + } + + i++; + } + + return -1; +} + +#endif /* __DPDK_IPSEC_H__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/dpdk/main.c b/src/plugins/dpdk/main.c new file mode 100644 index 00000000..f2f1ba22 --- /dev/null +++ b/src/plugins/dpdk/main.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vnet/vnet.h> +#include <vnet/plugin/plugin.h> +#include <dpdk/device/dpdk.h> +#include <vpp/app/version.h> + +/* + * Called by the dpdk driver's rte_delay_us() function. + * Return 0 to have the dpdk do a regular delay loop. + * Return 1 if to skip the delay loop because we are suspending + * the calling vlib process instead. + */ +static int +rte_delay_us_override (unsigned us) +{ + vlib_main_t *vm; + + /* Don't bother intercepting for short delays */ + if (us < 10) + return 0; + + /* + * Only intercept if we are in a vlib process. + * If we are called from a vlib worker thread or the vlib main + * thread then do not intercept. (Must not be called from an + * independent pthread). 
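 * Annotation: suspending the calling vlib process instead of busy-waiting
 * keeps long DPDK delays (e.g. link-state handling in the admin up/down
 * process, the only caller permitted to suspend below) from stalling vpp.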
+ */ + if (vlib_get_thread_index () == 0) + { + /* + * We're in the vlib main thread or a vlib process. Make sure + * the process is running and we're not still initializing. + */ + vm = vlib_get_main (); + if (vlib_in_process_context (vm)) + { + /* Only suspend for the admin_down_process */ + vlib_process_t *proc = vlib_get_current_process (vm); + if (!(proc->flags & VLIB_PROCESS_IS_RUNNING) || + (proc->node_runtime.function != admin_up_down_process)) + return 0; + + f64 delay = 1e-6 * us; + vlib_process_suspend (vm, delay); + return 1; + } + } + return 0; // no override +} + +static void +rte_delay_us_override_cb (unsigned us) +{ + if (rte_delay_us_override (us) == 0) + rte_delay_us_block (us); +} + +static clib_error_t * dpdk_main_init (vlib_main_t * vm) +{ + dpdk_main_t * dm = &dpdk_main; + clib_error_t * error = 0; + + dm->vlib_main = vm; + dm->vnet_main = vnet_get_main (); + + if ((error = vlib_call_init_function (vm, dpdk_init))) + return error; + + /* register custom delay function */ + rte_delay_us_callback_register (rte_delay_us_override_cb); + + return error; +} + +VLIB_INIT_FUNCTION (dpdk_main_init); + +/* *INDENT-OFF* */ +VLIB_PLUGIN_REGISTER () = { + .version = VPP_BUILD_VER, + .description = "Data Plane Development Kit (DPDK)", +}; +/* *INDENT-ON* */ diff --git a/src/plugins/dpdk/thread.c b/src/plugins/dpdk/thread.c new file mode 100644 index 00000000..3a3fcc6c --- /dev/null +++ b/src/plugins/dpdk/thread.c @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <rte_config.h> + +#include <rte_common.h> +#include <rte_log.h> +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_tailq.h> +#include <rte_eal.h> +#include <rte_per_lcore.h> +#include <rte_launch.h> +#include <rte_atomic.h> +#include <rte_cycles.h> +#include <rte_prefetch.h> +#include <rte_lcore.h> +#include <rte_per_lcore.h> +#include <rte_branch_prediction.h> +#include <rte_interrupts.h> +#include <rte_pci.h> +#include <rte_random.h> +#include <rte_debug.h> +#include <rte_ether.h> +#include <rte_ethdev.h> +#include <rte_ring.h> +#include <rte_mempool.h> +#include <rte_mbuf.h> +#include <rte_version.h> + +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <dpdk/device/dpdk.h> +#include <dpdk/device/dpdk_priv.h> + +static clib_error_t * +dpdk_launch_thread (void *fp, vlib_worker_thread_t * w, unsigned lcore_id) +{ + int r; + r = rte_eal_remote_launch (fp, (void *) w, lcore_id); + if (r) + return clib_error_return (0, "Failed to launch thread %u", lcore_id); + return 0; +} + +static clib_error_t * +dpdk_thread_set_lcore (u32 thread, u16 lcore) +{ + return 0; +} + +static vlib_thread_callbacks_t callbacks = { + .vlib_launch_thread_cb = &dpdk_launch_thread, + .vlib_thread_set_lcore_cb = &dpdk_thread_set_lcore, +}; + +static clib_error_t * +dpdk_thread_init (vlib_main_t * vm) +{ + vlib_thread_cb_register (vm, &callbacks); + return 0; +} + +VLIB_INIT_FUNCTION (dpdk_thread_init); + +/** @endcond */ +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/flowprobe.am b/src/plugins/flowprobe.am new file mode 100644 index 00000000..c56e246d --- /dev/null +++ b/src/plugins/flowprobe.am @@ -0,0 +1,37 @@ + +# Copyright (c) <current-year> <your-organization> +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
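# Annotation: this fragment wires the flowprobe plugin into the top-level
# plugin Makefile. It registers the data-plane plugin and its vpp_api_test
# companion, and adds flowprobe/flowprobe.api to API_FILES so the shared
# suffix rules generate the .api.h and .api.json files listed below.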
+ +vppplugins_LTLIBRARIES += flowprobe_plugin.la +vppapitestplugins_LTLIBRARIES += flowprobe_test_plugin.la + +flowprobe_plugin_la_SOURCES = flowprobe/flowprobe.c \ + flowprobe/node.c \ + flowprobe/flowprobe_plugin.api.h + +BUILT_SOURCES += \ + flowprobe/flowprobe.api.h \ + flowprobe/flowprobe.api.json + +noinst_HEADERS += \ + flowprobe/flowprobe_all_api_h.h \ + flowprobe/flowprobe_msg_enum.h \ + flowprobe/flowprobe.api.h + +flowprobe_test_plugin_la_SOURCES = \ + flowprobe/flowprobe_test.c \ + flowprobe/flowprobe_plugin.api.h + +API_FILES += flowprobe/flowprobe.api + +# vi:syntax=automake diff --git a/src/plugins/flowprobe/flowprobe.api b/src/plugins/flowprobe/flowprobe.api new file mode 100644 index 00000000..3f8c583b --- /dev/null +++ b/src/plugins/flowprobe/flowprobe.api @@ -0,0 +1,40 @@ +/* Define a simple enable-disable binary API to control the feature */ + +/** \file + This file defines the vpp control-plane API messages + used to control the flowprobe plugin +*/ + +/** \brief Enable / disable per-packet IPFIX recording on an interface + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param is_add - add address if non-zero, else delete + @param is_ipv6 - if non-zero the address is ipv6, else ipv4 + @param sw_if_index - index of the interface +*/ +autoreply manual_print define flowprobe_tx_interface_add_del +{ + /* Client identifier, set from api_main.my_client_index */ + u32 client_index; + + /* Arbitrary context, so client can match reply to request */ + u32 context; + + /* Enable / disable the feature */ + u8 is_add; + u8 which; /* 0 = ipv4, 1 = l2, 2 = ipv6 */ + + /* Interface handle */ + u32 sw_if_index; +}; + +autoreply define flowprobe_params +{ + u32 client_index; + u32 context; + u8 record_l2; + u8 record_l3; + u8 record_l4; + u32 active_timer; /* ~0 is off, 0 is default */ + u32 passive_timer; /* ~0 is off, 0 is default */ +}; diff --git a/src/plugins/flowprobe/flowprobe.c b/src/plugins/flowprobe/flowprobe.c new file mode 100644 index 00000000..884b5a2e --- /dev/null +++ b/src/plugins/flowprobe/flowprobe.c @@ -0,0 +1,1137 @@ +/* + * flowprobe.c - ipfix probe plugin + * + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * @brief Per-packet IPFIX flow record generator plugin + * + * This file implements vpp plugin registration mechanics, + * debug CLI, and binary API handling. 
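 * (Annotation: the IPFIX template builders that follow emit per-variant
 * field-specifier lists; each *_field_count() define appears to track the
 * number of f++ increments in its builder, and the template rewrite code
 * sums those counts to size the template, so they must stay in sync.)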
+ */ + +#include <vnet/vnet.h> +#include <vpp/app/version.h> +#include <vnet/plugin/plugin.h> +#include <flowprobe/flowprobe.h> + +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vlibsocket/api.h> + +/* define message IDs */ +#include <flowprobe/flowprobe_msg_enum.h> + +/* define message structures */ +#define vl_typedefs +#include <flowprobe/flowprobe_all_api_h.h> +#undef vl_typedefs + +/* define generated endian-swappers */ +#define vl_endianfun +#include <flowprobe/flowprobe_all_api_h.h> +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__) +#define vl_printfun +#include <flowprobe/flowprobe_all_api_h.h> +#undef vl_printfun + +flowprobe_main_t flowprobe_main; +vlib_node_registration_t flowprobe_walker_node; +static vlib_node_registration_t flowprobe_timer_node; +uword flowprobe_walker_process (vlib_main_t * vm, vlib_node_runtime_t * rt, + vlib_frame_t * f); + +/* Get the API version number */ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <flowprobe/flowprobe_all_api_h.h> +#undef vl_api_version + +#define REPLY_MSG_ID_BASE fm->msg_id_base +#include <vlibapi/api_helper_macros.h> + +/* Define the per-interface configurable features */ +/* *INDENT-OFF* */ +VNET_FEATURE_INIT (flow_perpacket_ip4, static) = +{ + .arc_name = "ip4-output", + .node_name = "flowprobe-ip4", + .runs_before = VNET_FEATURES ("interface-output"), +}; + +VNET_FEATURE_INIT (flow_perpacket_ip6, static) = +{ + .arc_name = "ip6-output", + .node_name = "flowprobe-ip6", + .runs_before = VNET_FEATURES ("interface-output"), +}; + +VNET_FEATURE_INIT (flow_perpacket_l2, static) = +{ + .arc_name = "interface-output", + .node_name = "flowprobe-l2", + .runs_before = VNET_FEATURES ("interface-tx"), +}; +/* *INDENT-ON* */ + +/* Macro to finish up custom dump fns */ +#define FINISH \ + vec_add1 (s, 0); \ + vl_print (handle, (char *)s); \ + vec_free (s); \ + return handle; + +static inline ipfix_field_specifier_t * +flowprobe_template_ip4_fields (ipfix_field_specifier_t * f) +{ +#define flowprobe_template_ip4_field_count() 4 + /* sourceIpv4Address, TLV type 8, u32 */ + f->e_id_length = ipfix_e_id_length (0 /* enterprise */ , + sourceIPv4Address, 4); + f++; + /* destinationIPv4Address, TLV type 12, u32 */ + f->e_id_length = ipfix_e_id_length (0 /* enterprise */ , + destinationIPv4Address, 4); + f++; + /* protocolIdentifier, TLV type 4, u8 */ + f->e_id_length = ipfix_e_id_length (0 /* enterprise */ , + protocolIdentifier, 1); + f++; + /* octetDeltaCount, TLV type 1, u64 */ + f->e_id_length = ipfix_e_id_length (0 /* enterprise */ , + octetDeltaCount, 8); + f++; + return f; +} + +static inline ipfix_field_specifier_t * +flowprobe_template_ip6_fields (ipfix_field_specifier_t * f) +{ +#define flowprobe_template_ip6_field_count() 4 + /* sourceIpv6Address, TLV type 27, 16 octets */ + f->e_id_length = ipfix_e_id_length (0 /* enterprise */ , + sourceIPv6Address, 16); + f++; + /* destinationIPv6Address, TLV type 28, 16 octets */ + f->e_id_length = ipfix_e_id_length (0 /* enterprise */ , + destinationIPv6Address, 16); + f++; + /* protocolIdentifier, TLV type 4, u8 */ + f->e_id_length = ipfix_e_id_length (0 /* enterprise */ , + protocolIdentifier, 1); + f++; + /* octetDeltaCount, TLV type 1, u64 */ + f->e_id_length = ipfix_e_id_length (0 /* enterprise */ , + octetDeltaCount, 8); + f++; + return f; +} + +static inline ipfix_field_specifier_t * +flowprobe_template_l2_fields (ipfix_field_specifier_t * f) +{ +#define 
flowprobe_template_l2_field_count() 3 + /* sourceMacAddress, TLV type 56, u8[6] we hope */ + f->e_id_length = ipfix_e_id_length (0 /* enterprise */ , + sourceMacAddress, 6); + f++; + /* destinationMacAddress, TLV type 80, u8[6] we hope */ + f->e_id_length = ipfix_e_id_length (0 /* enterprise */ , + destinationMacAddress, 6); + f++; + /* ethernetType, TLV type 256, u16 */ + f->e_id_length = ipfix_e_id_length (0 /* enterprise */ , + ethernetType, 2); + f++; + return f; +} + +static inline ipfix_field_specifier_t * +flowprobe_template_common_fields (ipfix_field_specifier_t * f) +{ +#define flowprobe_template_common_field_count() 5 + /* ingressInterface, TLV type 10, u32 */ + f->e_id_length = ipfix_e_id_length (0 /* enterprise */ , + ingressInterface, 4); + f++; + + /* egressInterface, TLV type 14, u32 */ + f->e_id_length = ipfix_e_id_length (0 /* enterprise */ , + egressInterface, 4); + f++; + + /* packetDeltaCount, TLV type 2, u64 */ + f->e_id_length = ipfix_e_id_length (0 /* enterprise */ , + packetDeltaCount, 8); + f++; + + /* flowStartNanoseconds, TLV type 156, u64 */ + f->e_id_length = ipfix_e_id_length (0 /* enterprise */ , + flowStartNanoseconds, 8); + f++; + + /* flowEndNanoseconds, TLV type 157, u64 */ + f->e_id_length = ipfix_e_id_length (0 /* enterprise */ , + flowEndNanoseconds, 8); + f++; + + return f; +} + +static inline ipfix_field_specifier_t * +flowprobe_template_l4_fields (ipfix_field_specifier_t * f) +{ +#define flowprobe_template_l4_field_count() 3 + /* sourceTransportPort, TLV type 7, u16 */ + f->e_id_length = ipfix_e_id_length (0 /* enterprise */ , + sourceTransportPort, 2); + f++; + /* destinationTransportPort, TLV type 11, u16 */ + f->e_id_length = ipfix_e_id_length (0 /* enterprise */ , + destinationTransportPort, 2); + f++; + /* tcpControlBits, TLV type 6, u16 */ + f->e_id_length = ipfix_e_id_length (0 /* enterprise */ , + tcpControlBits, 2); + f++; + + return f; +} + +/** + * @brief Create an IPFIX template packet rewrite string + * @param frm flow_report_main_t * + * @param fr flow_report_t * + * @param collector_address ip4_address_t * the IPFIX collector address + * @param src_address ip4_address_t * the source address we should use + * @param collector_port u16 the collector port we should use, host byte order + * @returns u8 * vector containing the indicated IPFIX template packet + */ +static inline u8 * +flowprobe_template_rewrite_inline (flow_report_main_t * frm, + flow_report_t * fr, + ip4_address_t * collector_address, + ip4_address_t * src_address, + u16 collector_port, + flowprobe_variant_t which) +{ + ip4_header_t *ip; + udp_header_t *udp; + ipfix_message_header_t *h; + ipfix_set_header_t *s; + ipfix_template_header_t *t; + ipfix_field_specifier_t *f; + ipfix_field_specifier_t *first_field; + u8 *rewrite = 0; + ip4_ipfix_template_packet_t *tp; + u32 field_count = 0; + flow_report_stream_t *stream; + flowprobe_main_t *fm = &flowprobe_main; + flowprobe_record_t flags = fr->opaque.as_uword; + bool collect_ip4 = false, collect_ip6 = false; + + stream = &frm->streams[fr->stream_index]; + + if (flags & FLOW_RECORD_L3) + { + collect_ip4 = which == FLOW_VARIANT_L2_IP4 || which == FLOW_VARIANT_IP4; + collect_ip6 = which == FLOW_VARIANT_L2_IP6 || which == FLOW_VARIANT_IP6; + if (which == FLOW_VARIANT_L2_IP4) + flags |= FLOW_RECORD_L2_IP4; + if (which == FLOW_VARIANT_L2_IP6) + flags |= FLOW_RECORD_L2_IP6; + } + + field_count += flowprobe_template_common_field_count (); + if (flags & FLOW_RECORD_L2) + field_count += flowprobe_template_l2_field_count (); + if 
(collect_ip4) + field_count += flowprobe_template_ip4_field_count (); + if (collect_ip6) + field_count += flowprobe_template_ip6_field_count (); + if (flags & FLOW_RECORD_L4) + field_count += flowprobe_template_l4_field_count (); + + /* allocate rewrite space */ + vec_validate_aligned + (rewrite, sizeof (ip4_ipfix_template_packet_t) + + field_count * sizeof (ipfix_field_specifier_t) - 1, + CLIB_CACHE_LINE_BYTES); + + tp = (ip4_ipfix_template_packet_t *) rewrite; + ip = (ip4_header_t *) & tp->ip4; + udp = (udp_header_t *) (ip + 1); + h = (ipfix_message_header_t *) (udp + 1); + s = (ipfix_set_header_t *) (h + 1); + t = (ipfix_template_header_t *) (s + 1); + first_field = f = (ipfix_field_specifier_t *) (t + 1); + + ip->ip_version_and_header_length = 0x45; + ip->ttl = 254; + ip->protocol = IP_PROTOCOL_UDP; + ip->src_address.as_u32 = src_address->as_u32; + ip->dst_address.as_u32 = collector_address->as_u32; + udp->src_port = clib_host_to_net_u16 (stream->src_port); + udp->dst_port = clib_host_to_net_u16 (collector_port); + udp->length = clib_host_to_net_u16 (vec_len (rewrite) - sizeof (*ip)); + + /* FIXUP: message header export_time */ + /* FIXUP: message header sequence_number */ + h->domain_id = clib_host_to_net_u32 (stream->domain_id); + + /* Add TLVs to the template */ + f = flowprobe_template_common_fields (f); + + if (flags & FLOW_RECORD_L2) + f = flowprobe_template_l2_fields (f); + if (collect_ip4) + f = flowprobe_template_ip4_fields (f); + if (collect_ip6) + f = flowprobe_template_ip6_fields (f); + if (flags & FLOW_RECORD_L4) + f = flowprobe_template_l4_fields (f); + + /* Back to the template packet... */ + ip = (ip4_header_t *) & tp->ip4; + udp = (udp_header_t *) (ip + 1); + + ASSERT (f - first_field); + /* Field count in this template */ + t->id_count = ipfix_id_count (fr->template_id, f - first_field); + + fm->template_size[flags] = (u8 *) f - (u8 *) s; + + /* set length in octets */ + s->set_id_length = + ipfix_set_id_length (2 /* set_id */ , (u8 *) f - (u8 *) s); + + /* message length in octets */ + h->version_length = version_length ((u8 *) f - (u8 *) h); + + ip->length = clib_host_to_net_u16 ((u8 *) f - (u8 *) ip); + ip->checksum = ip4_header_checksum (ip); + + return rewrite; +} + +static u8 * +flowprobe_template_rewrite_ip6 (flow_report_main_t * frm, + flow_report_t * fr, + ip4_address_t * collector_address, + ip4_address_t * src_address, + u16 collector_port) +{ + return flowprobe_template_rewrite_inline + (frm, fr, collector_address, src_address, collector_port, + FLOW_VARIANT_IP6); +} + +static u8 * +flowprobe_template_rewrite_ip4 (flow_report_main_t * frm, + flow_report_t * fr, + ip4_address_t * collector_address, + ip4_address_t * src_address, + u16 collector_port) +{ + return flowprobe_template_rewrite_inline + (frm, fr, collector_address, src_address, collector_port, + FLOW_VARIANT_IP4); +} + +static u8 * +flowprobe_template_rewrite_l2 (flow_report_main_t * frm, + flow_report_t * fr, + ip4_address_t * collector_address, + ip4_address_t * src_address, + u16 collector_port) +{ + return flowprobe_template_rewrite_inline + (frm, fr, collector_address, src_address, collector_port, + FLOW_VARIANT_L2); +} + +static u8 * +flowprobe_template_rewrite_l2_ip4 (flow_report_main_t * frm, + flow_report_t * fr, + ip4_address_t * collector_address, + ip4_address_t * src_address, + u16 collector_port) +{ + return flowprobe_template_rewrite_inline + (frm, fr, collector_address, src_address, collector_port, + FLOW_VARIANT_L2_IP4); +} + +static u8 * +flowprobe_template_rewrite_l2_ip6 
(flow_report_main_t * frm, + flow_report_t * fr, + ip4_address_t * collector_address, + ip4_address_t * src_address, + u16 collector_port) +{ + return flowprobe_template_rewrite_inline + (frm, fr, collector_address, src_address, collector_port, + FLOW_VARIANT_L2_IP6); +} + +/** + * @brief Flush accumulated data + * @param frm flow_report_main_t * + * @param fr flow_report_t * + * @param f vlib_frame_t * + * + * <em>Notes:</em> + * This function must simply return the incoming frame, or no template packets + * will be sent. + */ +vlib_frame_t * +flowprobe_data_callback_ip4 (flow_report_main_t * frm, + flow_report_t * fr, + vlib_frame_t * f, u32 * to_next, u32 node_index) +{ + flowprobe_flush_callback_ip4 (); + return f; +} + +vlib_frame_t * +flowprobe_data_callback_ip6 (flow_report_main_t * frm, + flow_report_t * fr, + vlib_frame_t * f, u32 * to_next, u32 node_index) +{ + flowprobe_flush_callback_ip6 (); + return f; +} + +vlib_frame_t * +flowprobe_data_callback_l2 (flow_report_main_t * frm, + flow_report_t * fr, + vlib_frame_t * f, u32 * to_next, u32 node_index) +{ + flowprobe_flush_callback_l2 (); + return f; +} + +static int +flowprobe_template_add_del (u32 domain_id, u16 src_port, + flowprobe_record_t flags, + vnet_flow_data_callback_t * flow_data_callback, + vnet_flow_rewrite_callback_t * rewrite_callback, + bool is_add, u16 * template_id) +{ + flow_report_main_t *frm = &flow_report_main; + vnet_flow_report_add_del_args_t a = { + .rewrite_callback = rewrite_callback, + .flow_data_callback = flow_data_callback, + .is_add = is_add, + .domain_id = domain_id, + .src_port = src_port, + .opaque.as_uword = flags, + }; + return vnet_flow_report_add_del (frm, &a, template_id); +} + +static void +flowprobe_expired_timer_callback (u32 * expired_timers) +{ + vlib_main_t *vm = vlib_get_main (); + flowprobe_main_t *fm = &flowprobe_main; + u32 my_cpu_number = vm->thread_index; + int i; + u32 poolindex; + + for (i = 0; i < vec_len (expired_timers); i++) + { + poolindex = expired_timers[i] & 0x7FFFFFFF; + vec_add1 (fm->expired_passive_per_worker[my_cpu_number], poolindex); + } +} + +static clib_error_t * +flowprobe_create_state_tables (u32 active_timer) +{ + flowprobe_main_t *fm = &flowprobe_main; + vlib_thread_main_t *tm = &vlib_thread_main; + vlib_main_t *vm = vlib_get_main (); + clib_error_t *error = 0; + u32 num_threads; + int i; + + /* Decide how many worker threads we have */ + num_threads = 1 /* main thread */ + tm->n_threads; + + /* Hash table per worker */ + fm->ht_log2len = FLOWPROBE_LOG2_HASHSIZE; + + /* Init per worker flow state and timer wheels */ + if (active_timer) + { + vec_validate (fm->timers_per_worker, num_threads - 1); + vec_validate (fm->expired_passive_per_worker, num_threads - 1); + vec_validate (fm->hash_per_worker, num_threads - 1); + vec_validate (fm->pool_per_worker, num_threads - 1); + + for (i = 0; i < num_threads; i++) + { + int j; + pool_alloc (fm->pool_per_worker[i], 1 << fm->ht_log2len); + vec_resize (fm->hash_per_worker[i], 1 << fm->ht_log2len); + for (j = 0; j < (1 << fm->ht_log2len); j++) + fm->hash_per_worker[i][j] = ~0; + fm->timers_per_worker[i] = + clib_mem_alloc (sizeof (TWT (tw_timer_wheel))); + tw_timer_wheel_init_2t_1w_2048sl (fm->timers_per_worker[i], + flowprobe_expired_timer_callback, + 1.0, 1024); + } + fm->disabled = true; + } + else + { + f64 now = vlib_time_now (vm); + vec_validate (fm->stateless_entry, num_threads - 1); + for (i = 0; i < num_threads; i++) + fm->stateless_entry[i].last_exported = now; + fm->disabled = false; + } + fm->initialized = 
true; + return error; +} + +static int +validate_feature_on_interface (flowprobe_main_t * fm, u32 sw_if_index, + u8 which) +{ + vec_validate_init_empty (fm->flow_per_interface, sw_if_index, ~0); + + if (fm->flow_per_interface[sw_if_index] == (u8) ~ 0) + return -1; + else if (fm->flow_per_interface[sw_if_index] != which) + return 0; + else + return 1; +} + +/** + * @brief configure / deconfigure the IPFIX flow-per-packet + * @param fm flowprobe_main_t * fm + * @param sw_if_index u32 the desired interface + * @param is_add int 1 to enable the feature, 0 to disable it + * @returns 0 if successful, non-zero otherwise + */ + +static int +flowprobe_tx_interface_add_del_feature (flowprobe_main_t * fm, + u32 sw_if_index, u8 which, int is_add) +{ + vlib_main_t *vm = vlib_get_main (); + int rv = 0; + u16 template_id = 0; + flowprobe_record_t flags = fm->record; + + fm->flow_per_interface[sw_if_index] = (is_add) ? which : (u8) ~ 0; + fm->template_per_flow[which] += (is_add) ? 1 : -1; + if (is_add && fm->template_per_flow[which] > 1) + template_id = fm->template_reports[flags]; + + if ((is_add && fm->template_per_flow[which] == 1) || + (!is_add && fm->template_per_flow[which] == 0)) + { + if (which == FLOW_VARIANT_L2) + { + if (fm->record & FLOW_RECORD_L2) + { + rv = flowprobe_template_add_del (1, UDP_DST_PORT_ipfix, flags, + flowprobe_data_callback_l2, + flowprobe_template_rewrite_l2, + is_add, &template_id); + } + if (fm->record & FLOW_RECORD_L3 || fm->record & FLOW_RECORD_L4) + { + rv = flowprobe_template_add_del (1, UDP_DST_PORT_ipfix, flags, + flowprobe_data_callback_l2, + flowprobe_template_rewrite_l2_ip4, + is_add, &template_id); + fm->template_reports[flags | FLOW_RECORD_L2_IP4] = + (is_add) ? template_id : 0; + rv = + flowprobe_template_add_del (1, UDP_DST_PORT_ipfix, flags, + flowprobe_data_callback_l2, + flowprobe_template_rewrite_l2_ip6, + is_add, &template_id); + fm->template_reports[flags | FLOW_RECORD_L2_IP6] = + (is_add) ? template_id : 0; + + /* Special case L2 */ + fm->context[FLOW_VARIANT_L2_IP4].flags = + flags | FLOW_RECORD_L2_IP4; + fm->context[FLOW_VARIANT_L2_IP6].flags = + flags | FLOW_RECORD_L2_IP6; + + fm->template_reports[flags] = template_id; + } + } + else if (which == FLOW_VARIANT_IP4) + rv = flowprobe_template_add_del (1, UDP_DST_PORT_ipfix, flags, + flowprobe_data_callback_ip4, + flowprobe_template_rewrite_ip4, + is_add, &template_id); + else if (which == FLOW_VARIANT_IP6) + rv = flowprobe_template_add_del (1, UDP_DST_PORT_ipfix, flags, + flowprobe_data_callback_ip6, + flowprobe_template_rewrite_ip6, + is_add, &template_id); + } + if (rv && rv != VNET_API_ERROR_VALUE_EXIST) + { + clib_warning ("vnet_flow_report_add_del returned %d", rv); + return -1; + } + + if (which != (u8) ~ 0) + { + fm->context[which].flags = fm->record; + fm->template_reports[flags] = (is_add) ? 
template_id : 0; + } + + if (which == FLOW_VARIANT_IP4) + vnet_feature_enable_disable ("ip4-output", "flowprobe-ip4", + sw_if_index, is_add, 0, 0); + else if (which == FLOW_VARIANT_IP6) + vnet_feature_enable_disable ("ip6-output", "flowprobe-ip6", + sw_if_index, is_add, 0, 0); + else if (which == FLOW_VARIANT_L2) + vnet_feature_enable_disable ("interface-output", "flowprobe-l2", + sw_if_index, is_add, 0, 0); + + /* Stateful flow collection */ + if (is_add && !fm->initialized) + { + flowprobe_create_state_tables (fm->active_timer); + if (fm->active_timer) + vlib_process_signal_event (vm, flowprobe_timer_node.index, 1, 0); + } + + return 0; +} + +/** + * @brief API message handler + * @param mp vl_api_flowprobe_tx_interface_add_del_t * mp the api message + */ +void vl_api_flowprobe_tx_interface_add_del_t_handler + (vl_api_flowprobe_tx_interface_add_del_t * mp) +{ + flowprobe_main_t *fm = &flowprobe_main; + vl_api_flowprobe_tx_interface_add_del_reply_t *rmp; + u32 sw_if_index = ntohl (mp->sw_if_index); + int rv = 0; + + VALIDATE_SW_IF_INDEX (mp); + + if (mp->which != FLOW_VARIANT_IP4 && mp->which != FLOW_VARIANT_L2 + && mp->which != FLOW_VARIANT_IP6) + { + rv = VNET_API_ERROR_UNIMPLEMENTED; + goto out; + } + + if (fm->record == 0) + { + clib_warning ("Please specify flowprobe params record first..."); + rv = VNET_API_ERROR_CANNOT_ENABLE_DISABLE_FEATURE; + goto out; + } + + rv = validate_feature_on_interface (fm, sw_if_index, mp->which); + if ((rv == 1 && mp->is_add == 1) || rv == 0) + { + rv = VNET_API_ERROR_CANNOT_ENABLE_DISABLE_FEATURE; + goto out; + } + + rv = flowprobe_tx_interface_add_del_feature + (fm, sw_if_index, mp->which, mp->is_add); + +out: + BAD_SW_IF_INDEX_LABEL; + + REPLY_MACRO (VL_API_FLOWPROBE_TX_INTERFACE_ADD_DEL_REPLY); +} + +/** + * @brief API message custom-dump function + * @param mp vl_api_flowprobe_tx_interface_add_del_t * mp the api message + * @param handle void * print function handle + * @returns u8 * output string + */ +static void *vl_api_flowprobe_tx_interface_add_del_t_print + (vl_api_flowprobe_tx_interface_add_del_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: flowprobe_tx_interface_add_del "); + s = format (s, "sw_if_index %d is_add %d which %d ", + clib_host_to_net_u32 (mp->sw_if_index), + (int) mp->is_add, (int) mp->which); + FINISH; +} + +#define vec_neg_search(v,E) \ +({ \ + word _v(i) = 0; \ + while (_v(i) < vec_len(v) && v[_v(i)] == E) \ + { \ + _v(i)++; \ + } \ + if (_v(i) == vec_len(v)) \ + _v(i) = ~0; \ + _v(i); \ +}) + +static int +flowprobe_params (flowprobe_main_t * fm, u8 record_l2, + u8 record_l3, u8 record_l4, + u32 active_timer, u32 passive_timer) +{ + flowprobe_record_t flags = 0; + + if (vec_neg_search (fm->flow_per_interface, (u8) ~ 0) != ~0) + return ~0; + + if (record_l2) + flags |= FLOW_RECORD_L2; + if (record_l3) + flags |= FLOW_RECORD_L3; + if (record_l4) + flags |= FLOW_RECORD_L4; + + fm->record = flags; + + /* + * Timers: ~0 is default, 0 is off + */ + fm->active_timer = + (active_timer == (u32) ~ 0 ? FLOWPROBE_TIMER_ACTIVE : active_timer); + fm->passive_timer = + (passive_timer == (u32) ~ 0 ? 
FLOWPROBE_TIMER_PASSIVE : passive_timer); + + return 0; +} + +void +vl_api_flowprobe_params_t_handler (vl_api_flowprobe_params_t * mp) +{ + flowprobe_main_t *fm = &flowprobe_main; + vl_api_flowprobe_params_reply_t *rmp; + int rv = 0; + + rv = flowprobe_params + (fm, mp->record_l2, mp->record_l3, mp->record_l4, + clib_net_to_host_u32 (mp->active_timer), + clib_net_to_host_u32 (mp->passive_timer)); + + REPLY_MACRO (VL_API_FLOWPROBE_PARAMS_REPLY); +} + +/* List of message types that this plugin understands */ +#define foreach_flowprobe_plugin_api_msg \ +_(FLOWPROBE_TX_INTERFACE_ADD_DEL, flowprobe_tx_interface_add_del) \ +_(FLOWPROBE_PARAMS, flowprobe_params) + +/* *INDENT-OFF* */ +VLIB_PLUGIN_REGISTER () = { + .version = VPP_BUILD_VER, + .description = "Flow per Packet", +}; +/* *INDENT-ON* */ + +u8 * +format_flowprobe_entry (u8 * s, va_list * args) +{ + flowprobe_entry_t *e = va_arg (*args, flowprobe_entry_t *); + s = format (s, " %d/%d", e->key.rx_sw_if_index, e->key.tx_sw_if_index); + + s = format (s, " %U %U", format_ethernet_address, &e->key.src_mac, + format_ethernet_address, &e->key.dst_mac); + s = format (s, " %U -> %U", + format_ip46_address, &e->key.src_address, IP46_TYPE_ANY, + format_ip46_address, &e->key.dst_address, IP46_TYPE_ANY); + s = format (s, " %d", e->key.protocol); + s = format (s, " %d %d\n", clib_net_to_host_u16 (e->key.src_port), + clib_net_to_host_u16 (e->key.dst_port)); + + return s; +} + +static clib_error_t * +flowprobe_show_table_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cm) +{ + flowprobe_main_t *fm = &flowprobe_main; + int i; + flowprobe_entry_t *e; + + vlib_cli_output (vm, "Dumping IPFIX table"); + + for (i = 0; i < vec_len (fm->pool_per_worker); i++) + { + /* *INDENT-OFF* */ + pool_foreach (e, fm->pool_per_worker[i], ( + { + vlib_cli_output (vm, "%U", + format_flowprobe_entry, + e); + })); + /* *INDENT-ON* */ + + } + return 0; +} + +static clib_error_t * +flowprobe_show_stats_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cm) +{ + flowprobe_main_t *fm = &flowprobe_main; + int i; + + vlib_cli_output (vm, "IPFIX table statistics"); + vlib_cli_output (vm, "Flow entry size: %d\n", sizeof (flowprobe_entry_t)); + vlib_cli_output (vm, "Flow pool size per thread: %d\n", + 0x1 << FLOWPROBE_LOG2_HASHSIZE); + + for (i = 0; i < vec_len (fm->pool_per_worker); i++) + vlib_cli_output (vm, "Pool utilisation thread %d is %d%%\n", i, + (100 * pool_elts (fm->pool_per_worker[i])) / + (0x1 << FLOWPROBE_LOG2_HASHSIZE)); + return 0; +} + +static clib_error_t * +flowprobe_tx_interface_add_del_feature_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + flowprobe_main_t *fm = &flowprobe_main; + u32 sw_if_index = ~0; + int is_add = 1; + u8 which = FLOW_VARIANT_IP4; + int rv; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "disable")) + is_add = 0; + else if (unformat (input, "%U", unformat_vnet_sw_interface, + fm->vnet_main, &sw_if_index)); + else if (unformat (input, "ip4")) + which = FLOW_VARIANT_IP4; + else if (unformat (input, "ip6")) + which = FLOW_VARIANT_IP6; + else if (unformat (input, "l2")) + which = FLOW_VARIANT_L2; + else + break; + } + + if (fm->record == 0) + return clib_error_return (0, + "Please specify flowprobe params record first..."); + + if (sw_if_index == ~0) + return clib_error_return (0, "Please specify an interface..."); + + rv = validate_feature_on_interface (fm, sw_if_index, which); + if (rv == 1) + { + if (is_add) + return 
clib_error_return (0,
+				  "Datapath is already enabled for the given interface...");
+    }
+  else if (rv == 0)
+    return clib_error_return (0,
+			      "A different datapath is already enabled on this interface...");
+
+  rv =
+    flowprobe_tx_interface_add_del_feature (fm, sw_if_index, which, is_add);
+  switch (rv)
+    {
+    case 0:
+      break;
+
+    case VNET_API_ERROR_INVALID_SW_IF_INDEX:
+      return clib_error_return
+	(0, "Invalid interface, only works on physical ports");
+      break;
+
+    case VNET_API_ERROR_UNIMPLEMENTED:
+      return clib_error_return (0, "ip6 not supported");
+      break;
+
+    default:
+      return clib_error_return (0, "flowprobe_enable_disable returned %d",
+				rv);
+    }
+  return 0;
+}
+
+static clib_error_t *
+flowprobe_params_command_fn (vlib_main_t * vm,
+			     unformat_input_t * input,
+			     vlib_cli_command_t * cmd)
+{
+  flowprobe_main_t *fm = &flowprobe_main;
+  bool record_l2 = false, record_l3 = false, record_l4 = false;
+  u32 active_timer = ~0;
+  u32 passive_timer = ~0;
+
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (input, "active %d", &active_timer))
+	;
+      else if (unformat (input, "passive %d", &passive_timer))
+	;
+      else if (unformat (input, "record"))
+	while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+	  {
+	    if (unformat (input, "l2"))
+	      record_l2 = true;
+	    else if (unformat (input, "l3"))
+	      record_l3 = true;
+	    else if (unformat (input, "l4"))
+	      record_l4 = true;
+	    else
+	      break;
+	  }
+      else
+	break;
+    }
+
+  if (passive_timer > 0 && active_timer > passive_timer)
+    return clib_error_return (0,
+			      "The passive timer must be greater than the active timer...");
+
+  if (flowprobe_params (fm, record_l2, record_l3, record_l4,
+			active_timer, passive_timer))
+    return clib_error_return (0,
+			      "Cannot change flowprobe params while the feature is enabled on an interface...");
+  return 0;
+}
+
+/*?
+ * '<em>flowprobe feature add-del</em>' commands to enable/disable + * per-packet IPFIX flow record generation on an interface + * + * @cliexpar + * @parblock + * To enable per-packet IPFIX flow-record generation on an interface: + * @cliexcmd{flowprobe feature add-del GigabitEthernet2/0/0} + * + * To disable per-packet IPFIX flow-record generation on an interface: + * @cliexcmd{flowprobe feature add-del GigabitEthernet2/0/0 disable} + * @cliexend + * @endparblock +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (flowprobe_enable_disable_command, static) = { + .path = "flowprobe feature add-del", + .short_help = + "flowprobe feature add-del <interface-name> <l2|ip4|ip6> disable", + .function = flowprobe_tx_interface_add_del_feature_command_fn, +}; +VLIB_CLI_COMMAND (flowprobe_params_command, static) = { + .path = "flowprobe params", + .short_help = + "flowprobe params record <[l2] [l3] [l4]> [active <timer> passive <timer>]", + .function = flowprobe_params_command_fn, +}; +VLIB_CLI_COMMAND (flowprobe_show_table_command, static) = { + .path = "show flowprobe table", + .short_help = "show flowprobe table", + .function = flowprobe_show_table_fn, +}; +VLIB_CLI_COMMAND (flowprobe_show_stats_command, static) = { + .path = "show flowprobe statistics", + .short_help = "show flowprobe statistics", + .function = flowprobe_show_stats_fn, +}; +/* *INDENT-ON* */ + +/** + * @brief Set up the API message handling tables + * @param vm vlib_main_t * vlib main data structure pointer + * @returns 0 to indicate all is well + */ +static clib_error_t * +flowprobe_plugin_api_hookup (vlib_main_t * vm) +{ + flowprobe_main_t *fm = &flowprobe_main; +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + fm->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_flowprobe_plugin_api_msg; +#undef _ + + return 0; +} + +#define vl_msg_name_crc_list +#include <flowprobe/flowprobe_all_api_h.h> +#undef vl_msg_name_crc_list + +static void +setup_message_id_table (flowprobe_main_t * fm, api_main_t * am) +{ +#define _(id,n,crc) \ + vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id + fm->msg_id_base); + foreach_vl_msg_name_crc_flowprobe; +#undef _ +} + +/* + * Main-core process, sending an interrupt to the per worker input + * process that spins the per worker timer wheel. + */ +static uword +timer_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) +{ + uword *event_data = 0; + vlib_main_t **worker_vms = 0, *worker_vm; + flowprobe_main_t *fm = &flowprobe_main; + + /* Wait for Godot... */ + vlib_process_wait_for_event_or_clock (vm, 1e9); + uword event_type = vlib_process_get_events (vm, &event_data); + if (event_type != 1) + clib_warning ("bogus kickoff event received, %d", event_type); + vec_reset_length (event_data); + + int i; + if (vec_len (vlib_mains) == 0) + vec_add1 (worker_vms, vm); + else + { + for (i = 0; i < vec_len (vlib_mains); i++) + { + worker_vm = vlib_mains[i]; + if (worker_vm) + vec_add1 (worker_vms, worker_vm); + } + } + f64 sleep_duration = 0.1; + + while (1) + { + /* Send an interrupt to each timer input node */ + sleep_duration = 0.1; + for (i = 0; i < vec_len (worker_vms); i++) + { + worker_vm = worker_vms[i]; + if (worker_vm) + { + vlib_node_set_interrupt_pending (worker_vm, + flowprobe_walker_node.index); + sleep_duration = + (fm->expired_passive_per_worker[i] > 0) ? 
1e-4 : 0.1; + } + } + vlib_process_suspend (vm, sleep_duration); + } + return 0; /* or not */ +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (flowprobe_timer_node,static) = { + .function = timer_process, + .name = "flowprobe-timer-process", + .type = VLIB_NODE_TYPE_PROCESS, +}; +/* *INDENT-ON* */ + +/** + * @brief Set up the API message handling tables + * @param vm vlib_main_t * vlib main data structure pointer + * @returns 0 to indicate all is well, or a clib_error_t + */ +static clib_error_t * +flowprobe_init (vlib_main_t * vm) +{ + flowprobe_main_t *fm = &flowprobe_main; + vlib_thread_main_t *tm = &vlib_thread_main; + clib_error_t *error = 0; + u8 *name; + u32 num_threads; + int i; + + fm->vnet_main = vnet_get_main (); + + /* Construct the API name */ + name = format (0, "flowprobe_%08x%c", api_version, 0); + + /* Ask for a correctly-sized block of API message decode slots */ + fm->msg_id_base = vl_msg_api_get_msg_ids + ((char *) name, VL_MSG_FIRST_AVAILABLE); + + /* Hook up message handlers */ + error = flowprobe_plugin_api_hookup (vm); + + /* Add our API messages to the global name_crc hash table */ + setup_message_id_table (fm, &api_main); + + vec_free (name); + + /* Set up time reference pair */ + fm->vlib_time_0 = vlib_time_now (vm); + fm->nanosecond_time_0 = unix_time_now_nsec (); + + memset (fm->template_reports, 0, sizeof (fm->template_reports)); + memset (fm->template_size, 0, sizeof (fm->template_size)); + memset (fm->template_per_flow, 0, sizeof (fm->template_per_flow)); + + /* Decide how many worker threads we have */ + num_threads = 1 /* main thread */ + tm->n_threads; + + /* Allocate per worker thread vectors per flavour */ + for (i = 0; i < FLOW_N_VARIANTS; i++) + { + vec_validate (fm->context[i].buffers_per_worker, num_threads - 1); + vec_validate (fm->context[i].frames_per_worker, num_threads - 1); + vec_validate (fm->context[i].next_record_offset_per_worker, + num_threads - 1); + } + + fm->active_timer = FLOWPROBE_TIMER_ACTIVE; + fm->passive_timer = FLOWPROBE_TIMER_PASSIVE; + + return error; +} + +VLIB_INIT_FUNCTION (flowprobe_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/flowprobe/flowprobe.h b/src/plugins/flowprobe/flowprobe.h new file mode 100644 index 00000000..02ee053c --- /dev/null +++ b/src/plugins/flowprobe/flowprobe.h @@ -0,0 +1,174 @@ +/* + * flowprobe.h - ipfix probe plug-in header file + * + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __included_flowprobe_h__ +#define __included_flowprobe_h__ + +#include <vnet/vnet.h> +#include <vnet/ip/ip.h> +#include <vnet/ethernet/ethernet.h> + +#include <vppinfra/hash.h> +#include <vppinfra/error.h> +#include <vnet/flow/flow_report.h> +#include <vnet/flow/flow_report_classify.h> +#include <vppinfra/tw_timer_2t_1w_2048sl.h> + +/* Default timers in seconds */ +#define FLOWPROBE_TIMER_ACTIVE (15) +#define FLOWPROBE_TIMER_PASSIVE 120 // XXXX: FOR TESTING (30*60) +#define FLOWPROBE_LOG2_HASHSIZE (18) + +typedef enum +{ + FLOW_RECORD_L2 = 1 << 0, + FLOW_RECORD_L3 = 1 << 1, + FLOW_RECORD_L4 = 1 << 2, + FLOW_RECORD_L2_IP4 = 1 << 3, + FLOW_RECORD_L2_IP6 = 1 << 4, + FLOW_N_RECORDS = 1 << 5, +} flowprobe_record_t; + +/* *INDENT-OFF* */ +typedef enum __attribute__ ((__packed__)) +{ + FLOW_VARIANT_IP4, + FLOW_VARIANT_IP6, + FLOW_VARIANT_L2, + FLOW_VARIANT_L2_IP4, + FLOW_VARIANT_L2_IP6, + FLOW_N_VARIANTS, +} flowprobe_variant_t; +/* *INDENT-ON* */ + +STATIC_ASSERT (sizeof (flowprobe_variant_t) == 1, + "flowprobe_variant_t is expected to be 1 byte, " + "revisit padding in flowprobe_key_t"); + +#define FLOW_MAXIMUM_EXPORT_ENTRIES (1024) + +typedef struct +{ + /* what to collect per variant */ + flowprobe_record_t flags; + /** ipfix buffers under construction, per-worker thread */ + vlib_buffer_t **buffers_per_worker; + /** frames containing ipfix buffers, per-worker thread */ + vlib_frame_t **frames_per_worker; + /** next record offset, per worker thread */ + u16 *next_record_offset_per_worker; +} flowprobe_protocol_context_t; + +/* *INDENT-OFF* */ +typedef struct __attribute__ ((aligned (8))) { + u32 rx_sw_if_index; + u32 tx_sw_if_index; + u8 src_mac[6]; + u8 dst_mac[6]; + u16 ethertype; + ip46_address_t src_address; + ip46_address_t dst_address; + u8 protocol; + u16 src_port; + u16 dst_port; + flowprobe_variant_t which; +} flowprobe_key_t; +/* *INDENT-ON* */ + +typedef struct +{ + u32 sec; + u32 nsec; +} timestamp_nsec_t; + +typedef struct +{ + flowprobe_key_t key; + u64 packetcount; + u64 octetcount; + timestamp_nsec_t flow_start; + timestamp_nsec_t flow_end; + f64 last_updated; + f64 last_exported; + u32 passive_timer_handle; + union + { + struct + { + u16 flags; + } tcp; + } prot; +} flowprobe_entry_t; + +/** + * @file + * @brief flow-per-packet plugin header file + */ +typedef struct +{ + /** API message ID base */ + u16 msg_id_base; + + flowprobe_protocol_context_t context[FLOW_N_VARIANTS]; + u16 template_reports[FLOW_N_RECORDS]; + u16 template_size[FLOW_N_RECORDS]; + + /** Time reference pair */ + u64 nanosecond_time_0; + f64 vlib_time_0; + + /** Per CPU flow-state */ + u8 ht_log2len; /* Hash table size is 2^log2len */ + u32 **hash_per_worker; + flowprobe_entry_t **pool_per_worker; + /* *INDENT-OFF* */ + TWT (tw_timer_wheel) ** timers_per_worker; + /* *INDENT-ON* */ + u32 **expired_passive_per_worker; + + flowprobe_record_t record; + u32 active_timer; + u32 passive_timer; + flowprobe_entry_t *stateless_entry; + + bool initialized; + bool disabled; + + u16 template_per_flow[FLOW_N_VARIANTS]; + u8 *flow_per_interface; + + /** convenience vlib_main_t pointer */ + vlib_main_t *vlib_main; + /** convenience vnet_main_t pointer */ + vnet_main_t *vnet_main; +} flowprobe_main_t; + +extern flowprobe_main_t flowprobe_main; + +void flowprobe_flush_callback_ip4 (void); +void flowprobe_flush_callback_ip6 (void); +void flowprobe_flush_callback_l2 (void); +u8 *format_flowprobe_entry (u8 * s, va_list * args); + +#endif + +/* + * fd.io coding-style-patch-verification: ON + * + * Local 
Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/flowprobe/flowprobe_all_api_h.h b/src/plugins/flowprobe/flowprobe_all_api_h.h
new file mode 100644
index 00000000..1f30eccc
--- /dev/null
+++ b/src/plugins/flowprobe/flowprobe_all_api_h.h
@@ -0,0 +1,18 @@
+/*
+ * flowprobe_all_api_h.h - plug-in api #include file
+ *
+ * Copyright (c) <current-year> <your-organization>
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/* Include the generated file, see BUILT_SOURCES in Makefile.am */
+#include <flowprobe/flowprobe.api.h>
diff --git a/src/plugins/flowprobe/flowprobe_msg_enum.h b/src/plugins/flowprobe/flowprobe_msg_enum.h
new file mode 100644
index 00000000..bc0b21c9
--- /dev/null
+++ b/src/plugins/flowprobe/flowprobe_msg_enum.h
@@ -0,0 +1,31 @@
+/*
+ * flowprobe_msg_enum.h - vpp engine plug-in message enumeration
+ *
+ * Copyright (c) <current-year> <your-organization>
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef included_flowprobe_msg_enum_h
+#define included_flowprobe_msg_enum_h
+
+#include <vppinfra/byte_order.h>
+
+#define vl_msg_id(n,h) n,
+typedef enum
+{
+#include <flowprobe/flowprobe_all_api_h.h>
+  /* We'll want to know how many message IDs we need... */
+  VL_MSG_FIRST_AVAILABLE,
+} vl_msg_id_t;
+#undef vl_msg_id
+
+#endif /* included_flowprobe_msg_enum_h */
diff --git a/src/plugins/flowprobe/flowprobe_plugin_doc.md b/src/plugins/flowprobe/flowprobe_plugin_doc.md
new file mode 100644
index 00000000..4c9b2342
--- /dev/null
+++ b/src/plugins/flowprobe/flowprobe_plugin_doc.md
@@ -0,0 +1,13 @@
+IPFIX flow record plugin {#flowprobe_plugin_doc}
+========================
+
+## Introduction
+
+This plugin generates IPFIX flow records for traffic on interfaces on which the feature is enabled.
+
+## Sample configuration
+
+set ipfix exporter collector 192.168.6.2 src 192.168.6.1 template-interval 20 port 4739 path-mtu 1500
+
+flowprobe params record l3 active 20 passive 120
+flowprobe feature add-del GigabitEthernet2/3/0 l2
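+
+## Verifying operation
+
+Flow table contents and per-thread pool utilisation can be inspected
+with the plugin's debug CLI:
+
+show flowprobe table
+show flowprobe statistics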
\ No newline at end of file diff --git a/src/plugins/flowprobe/flowprobe_test.c b/src/plugins/flowprobe/flowprobe_test.c new file mode 100644 index 00000000..91793f55 --- /dev/null +++ b/src/plugins/flowprobe/flowprobe_test.c @@ -0,0 +1,263 @@ +/* + * flowprobe.c - skeleton vpp-api-test plug-in + * + * Copyright (c) <current-year> <your-organization> + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <vat/vat.h> +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vlibsocket/api.h> +#include <vppinfra/error.h> +#include <flowprobe/flowprobe.h> + +#define __plugin_msg_base flowprobe_test_main.msg_id_base +#include <vlibapi/vat_helper_macros.h> + +/** + * @file vpp_api_test plugin + */ + +uword unformat_sw_if_index (unformat_input_t * input, va_list * args); + +/* Declare message IDs */ +#include <flowprobe/flowprobe_msg_enum.h> + +/* define message structures */ +#define vl_typedefs +#include <flowprobe/flowprobe_all_api_h.h> +#undef vl_typedefs + +/* declare message handlers for each api */ + +#define vl_endianfun /* define message structures */ +#include <flowprobe/flowprobe_all_api_h.h> +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) +#define vl_printfun +#include <flowprobe/flowprobe_all_api_h.h> +#undef vl_printfun + +/* Get the API version number. 
*/ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <flowprobe/flowprobe_all_api_h.h> +#undef vl_api_version + +typedef struct +{ + /** API message ID base */ + u16 msg_id_base; + /** vat_main_t pointer */ + vat_main_t *vat_main; +} flowprobe_test_main_t; + +flowprobe_test_main_t flowprobe_test_main; + +#define foreach_standard_reply_retval_handler \ +_(flowprobe_tx_interface_add_del_reply) \ +_(flowprobe_params_reply) + +#define _(n) \ + static void vl_api_##n##_t_handler \ + (vl_api_##n##_t * mp) \ + { \ + vat_main_t * vam = flowprobe_test_main.vat_main; \ + i32 retval = ntohl(mp->retval); \ + if (vam->async_mode) { \ + vam->async_errors += (retval < 0); \ + } else { \ + vam->retval = retval; \ + vam->result_ready = 1; \ + } \ + } +foreach_standard_reply_retval_handler; +#undef _ + +/* + * Table of message reply handlers, must include boilerplate handlers + * we just generated + */ +#define foreach_vpe_api_reply_msg \ +_(FLOWPROBE_TX_INTERFACE_ADD_DEL_REPLY, \ + flowprobe_tx_interface_add_del_reply) \ +_(FLOWPROBE_PARAMS_REPLY, flowprobe_params_reply) + +static int +api_flowprobe_tx_interface_add_del (vat_main_t * vam) +{ + unformat_input_t *i = vam->input; + int enable_disable = 1; + u8 which = FLOW_VARIANT_IP4; + u32 sw_if_index = ~0; + vl_api_flowprobe_tx_interface_add_del_t *mp; + int ret; + + /* Parse args required to build the message */ + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) + { + if (unformat (i, "%U", unformat_sw_if_index, vam, &sw_if_index)) + ; + else if (unformat (i, "sw_if_index %d", &sw_if_index)) + ; + else if (unformat (i, "disable")) + enable_disable = 0; + else if (unformat (i, "ip4")) + which = FLOW_VARIANT_IP4; + else if (unformat (i, "ip6")) + which = FLOW_VARIANT_IP6; + else if (unformat (i, "l2")) + which = FLOW_VARIANT_L2; + else + break; + } + + if (sw_if_index == ~0) + { + errmsg ("missing interface name / explicit sw_if_index number \n"); + return -99; + } + + /* Construct the API message */ + M (FLOWPROBE_TX_INTERFACE_ADD_DEL, mp); + mp->sw_if_index = ntohl (sw_if_index); + mp->is_add = enable_disable; + mp->which = which; + + /* send it... */ + S (mp); + + /* Wait for a reply... */ + W (ret); + return ret; +} + +static int +api_flowprobe_params (vat_main_t * vam) +{ + unformat_input_t *i = vam->input; + u8 record_l2 = 0, record_l3 = 0, record_l4 = 0; + u32 active_timer = ~0; + u32 passive_timer = ~0; + vl_api_flowprobe_params_t *mp; + int ret; + + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) + { + if (unformat (i, "active %d", &active_timer)) + ; + else if (unformat (i, "passive %d", &passive_timer)) + ; + else if (unformat (i, "record")) + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) + { + if (unformat (i, "l2")) + record_l2 = 1; + else if (unformat (i, "l3")) + record_l3 = 1; + else if (unformat (i, "l4")) + record_l4 = 1; + else + break; + } + else + break; + } + + if (passive_timer > 0 && active_timer > passive_timer) + { + errmsg ("Passive timer has to be greater than active one...\n"); + return -99; + } + + /* Construct the API message */ + M (FLOWPROBE_PARAMS, mp); + mp->record_l2 = record_l2; + mp->record_l3 = record_l3; + mp->record_l4 = record_l4; + mp->active_timer = ntohl (active_timer); + mp->passive_timer = ntohl (passive_timer); + + /* send it... */ + S (mp); + + /* Wait for a reply... 
*/ + W (ret); + + return ret; +} + +/* + * List of messages that the api test plugin sends, + * and that the data plane plugin processes + */ +#define foreach_vpe_api_msg \ +_(flowprobe_tx_interface_add_del, "<intfc> [disable]") \ +_(flowprobe_params, "record <[l2] [l3] [l4]> [active <timer> passive <timer>]") + +static void +flowprobe_vat_api_hookup (vat_main_t * vam) +{ + flowprobe_test_main_t *sm = &flowprobe_test_main; + /* Hook up handlers for replies from the data plane plug-in */ +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + sm->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_vpe_api_reply_msg; +#undef _ + + /* API messages we can send */ +#define _(n,h) hash_set_mem (vam->function_by_name, #n, api_##n); + foreach_vpe_api_msg; +#undef _ + + /* Help strings */ +#define _(n,h) hash_set_mem (vam->help_by_name, #n, h); + foreach_vpe_api_msg; +#undef _ +} + +clib_error_t * +vat_plugin_register (vat_main_t * vam) +{ + flowprobe_test_main_t *sm = &flowprobe_test_main; + u8 *name; + + sm->vat_main = vam; + + /* Ask the vpp engine for the first assigned message-id */ + name = format (0, "flowprobe_%08x%c", api_version, 0); + sm->msg_id_base = vl_client_get_first_plugin_msg_id ((char *) name); + + /* Don't attempt to hook up API messages if the data plane plugin is AWOL */ + if (sm->msg_id_base != (u16) ~ 0) + flowprobe_vat_api_hookup (vam); + + vec_free (name); + + return 0; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/flowprobe/node.c b/src/plugins/flowprobe/node.c new file mode 100644 index 00000000..2f7d0025 --- /dev/null +++ b/src/plugins/flowprobe/node.c @@ -0,0 +1,1053 @@ +/* + * node.c - ipfix probe graph node + * + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vppinfra/crc32.h> +#include <vppinfra/error.h> +#include <flowprobe/flowprobe.h> +#include <vnet/ip/ip6_packet.h> +#include <vlibmemory/api.h> + +static void flowprobe_export_entry (vlib_main_t * vm, flowprobe_entry_t * e); + +/** + * @file flow record generator graph node + */ + +typedef struct +{ + /** interface handle */ + u32 rx_sw_if_index; + u32 tx_sw_if_index; + /** packet timestamp */ + u64 timestamp; + /** size of the buffer */ + u16 buffer_size; + + /** L2 information */ + u8 src_mac[6]; + u8 dst_mac[6]; + /** Ethertype */ + u16 ethertype; + + /** L3 information */ + ip46_address_t src_address; + ip46_address_t dst_address; + u8 protocol; + u8 tos; + + /** L4 information */ + u16 src_port; + u16 dst_port; + + flowprobe_variant_t which; +} flowprobe_trace_t; + +static char *flowprobe_variant_strings[] = { + [FLOW_VARIANT_IP4] = "IP4", + [FLOW_VARIANT_IP6] = "IP6", + [FLOW_VARIANT_L2] = "L2", + [FLOW_VARIANT_L2_IP4] = "L2-IP4", + [FLOW_VARIANT_L2_IP6] = "L2-IP6", +}; + +/* packet trace format function */ +static u8 * +format_flowprobe_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + flowprobe_trace_t *t = va_arg (*args, flowprobe_trace_t *); + uword indent = format_get_indent (s); + + s = format (s, + "FLOWPROBE[%s]: rx_sw_if_index %d, tx_sw_if_index %d, " + "timestamp %lld, size %d", flowprobe_variant_strings[t->which], + t->rx_sw_if_index, t->tx_sw_if_index, + t->timestamp, t->buffer_size); + + if (t->which == FLOW_VARIANT_L2) + s = format (s, "\n%U -> %U", format_white_space, indent, + format_ethernet_address, &t->src_mac, + format_ethernet_address, &t->dst_mac); + + if (t->protocol > 0 + && (t->which == FLOW_VARIANT_L2_IP4 || t->which == FLOW_VARIANT_IP4 + || t->which == FLOW_VARIANT_L2_IP6 || t->which == FLOW_VARIANT_IP6)) + s = + format (s, "\n%U%U: %U -> %U", format_white_space, indent, + format_ip_protocol, t->protocol, format_ip46_address, + &t->src_address, IP46_TYPE_ANY, format_ip46_address, + &t->dst_address, IP46_TYPE_ANY); + return s; +} + +vlib_node_registration_t flowprobe_ip4_node; +vlib_node_registration_t flowprobe_ip6_node; +vlib_node_registration_t flowprobe_l2_node; + +/* No counters at the moment */ +#define foreach_flowprobe_error \ +_(COLLISION, "Hash table collisions") \ +_(BUFFER, "Buffer allocation error") \ +_(EXPORTED_PACKETS, "Exported packets") \ +_(INPATH, "Exported packets in path") + +typedef enum +{ +#define _(sym,str) FLOWPROBE_ERROR_##sym, + foreach_flowprobe_error +#undef _ + FLOWPROBE_N_ERROR, +} flowprobe_error_t; + +static char *flowprobe_error_strings[] = { +#define _(sym,string) string, + foreach_flowprobe_error +#undef _ +}; + +typedef enum +{ + FLOWPROBE_NEXT_DROP, + FLOWPROBE_NEXT_IP4_LOOKUP, + FLOWPROBE_N_NEXT, +} flowprobe_next_t; + +#define FLOWPROBE_NEXT_NODES { \ + [FLOWPROBE_NEXT_DROP] = "error-drop", \ + [FLOWPROBE_NEXT_IP4_LOOKUP] = "ip4-lookup", \ +} + +static inline flowprobe_variant_t +flowprobe_get_variant (flowprobe_variant_t which, + flowprobe_record_t flags, u16 ethertype) +{ + if (which == FLOW_VARIANT_L2 + && (flags & FLOW_RECORD_L3 || flags & FLOW_RECORD_L4)) + return ethertype == ETHERNET_TYPE_IP6 ? FLOW_VARIANT_L2_IP6 : ethertype == + ETHERNET_TYPE_IP4 ? 
FLOW_VARIANT_L2_IP4 : FLOW_VARIANT_L2; + return which; +} + +/* + * NTP rfc868 : 2 208 988 800 corresponds to 00:00 1 Jan 1970 GMT + */ +#define NTP_TIMESTAMP 2208988800L + +static inline u32 +flowprobe_common_add (vlib_buffer_t * to_b, flowprobe_entry_t * e, u16 offset) +{ + u16 start = offset; + + /* Ingress interface */ + u32 rx_if = clib_host_to_net_u32 (e->key.rx_sw_if_index); + clib_memcpy (to_b->data + offset, &rx_if, sizeof (rx_if)); + offset += sizeof (rx_if); + + /* Egress interface */ + u32 tx_if = clib_host_to_net_u32 (e->key.tx_sw_if_index); + clib_memcpy (to_b->data + offset, &tx_if, sizeof (tx_if)); + offset += sizeof (tx_if); + + /* packet delta count */ + u64 packetdelta = clib_host_to_net_u64 (e->packetcount); + clib_memcpy (to_b->data + offset, &packetdelta, sizeof (u64)); + offset += sizeof (u64); + + /* flowStartNanoseconds */ + u32 t = clib_host_to_net_u32 (e->flow_start.sec + NTP_TIMESTAMP); + clib_memcpy (to_b->data + offset, &t, sizeof (u32)); + offset += sizeof (u32); + t = clib_host_to_net_u32 (e->flow_start.nsec); + clib_memcpy (to_b->data + offset, &t, sizeof (u32)); + offset += sizeof (u32); + + /* flowEndNanoseconds */ + t = clib_host_to_net_u32 (e->flow_end.sec + NTP_TIMESTAMP); + clib_memcpy (to_b->data + offset, &t, sizeof (u32)); + offset += sizeof (u32); + t = clib_host_to_net_u32 (e->flow_end.nsec); + clib_memcpy (to_b->data + offset, &t, sizeof (u32)); + offset += sizeof (u32); + + return offset - start; +} + +static inline u32 +flowprobe_l2_add (vlib_buffer_t * to_b, flowprobe_entry_t * e, u16 offset) +{ + u16 start = offset; + + /* src mac address */ + clib_memcpy (to_b->data + offset, &e->key.src_mac, 6); + offset += 6; + + /* dst mac address */ + clib_memcpy (to_b->data + offset, &e->key.dst_mac, 6); + offset += 6; + + /* ethertype */ + clib_memcpy (to_b->data + offset, &e->key.ethertype, 2); + offset += 2; + + return offset - start; +} + +static inline u32 +flowprobe_l3_ip6_add (vlib_buffer_t * to_b, flowprobe_entry_t * e, u16 offset) +{ + u16 start = offset; + + /* ip6 src address */ + clib_memcpy (to_b->data + offset, &e->key.src_address, + sizeof (ip6_address_t)); + offset += sizeof (ip6_address_t); + + /* ip6 dst address */ + clib_memcpy (to_b->data + offset, &e->key.dst_address, + sizeof (ip6_address_t)); + offset += sizeof (ip6_address_t); + + /* Protocol */ + to_b->data[offset++] = e->key.protocol; + + /* octetDeltaCount */ + u64 octetdelta = clib_host_to_net_u64 (e->octetcount); + clib_memcpy (to_b->data + offset, &octetdelta, sizeof (u64)); + offset += sizeof (u64); + + return offset - start; +} + +static inline u32 +flowprobe_l3_ip4_add (vlib_buffer_t * to_b, flowprobe_entry_t * e, u16 offset) +{ + u16 start = offset; + + /* ip4 src address */ + clib_memcpy (to_b->data + offset, &e->key.src_address.ip4, + sizeof (ip4_address_t)); + offset += sizeof (ip4_address_t); + + /* ip4 dst address */ + clib_memcpy (to_b->data + offset, &e->key.dst_address.ip4, + sizeof (ip4_address_t)); + offset += sizeof (ip4_address_t); + + /* Protocol */ + to_b->data[offset++] = e->key.protocol; + + /* octetDeltaCount */ + u64 octetdelta = clib_host_to_net_u64 (e->octetcount); + clib_memcpy (to_b->data + offset, &octetdelta, sizeof (u64)); + offset += sizeof (u64); + + return offset - start; +} + +static inline u32 +flowprobe_l4_add (vlib_buffer_t * to_b, flowprobe_entry_t * e, u16 offset) +{ + u16 start = offset; + + /* src port */ + clib_memcpy (to_b->data + offset, &e->key.src_port, 2); + offset += 2; + + /* dst port */ + clib_memcpy (to_b->data + offset, 
&e->key.dst_port, 2); + offset += 2; + + /* tcp control bits */ + u16 control_bits = htons (e->prot.tcp.flags); + clib_memcpy (to_b->data + offset, &control_bits, 2); + offset += 2; + + return offset - start; +} + +static inline u32 +flowprobe_hash (flowprobe_key_t * k) +{ + flowprobe_main_t *fm = &flowprobe_main; + u32 h = 0; + +#ifdef clib_crc32c_uses_intrinsics + h = clib_crc32c ((u8 *) k, sizeof (*k)); +#else + int i; + u64 tmp = 0; + for (i = 0; i < sizeof (*k) / 8; i++) + tmp ^= ((u64 *) k)[i]; + + h = clib_xxhash (tmp); +#endif + + return h >> (32 - fm->ht_log2len); +} + +flowprobe_entry_t * +flowprobe_lookup (u32 my_cpu_number, flowprobe_key_t * k, u32 * poolindex, + bool * collision) +{ + flowprobe_main_t *fm = &flowprobe_main; + flowprobe_entry_t *e; + u32 h; + + h = (fm->active_timer) ? flowprobe_hash (k) : 0; + + /* Lookup in the flow state pool */ + *poolindex = fm->hash_per_worker[my_cpu_number][h]; + if (*poolindex != ~0) + { + e = pool_elt_at_index (fm->pool_per_worker[my_cpu_number], *poolindex); + if (e) + { + /* Verify key or report collision */ + if (memcmp (k, &e->key, sizeof (flowprobe_key_t))) + *collision = true; + return e; + } + } + + return 0; +} + +flowprobe_entry_t * +flowprobe_create (u32 my_cpu_number, flowprobe_key_t * k, u32 * poolindex) +{ + flowprobe_main_t *fm = &flowprobe_main; + u32 h; + + flowprobe_entry_t *e; + + /* Get my index */ + h = (fm->active_timer) ? flowprobe_hash (k) : 0; + + pool_get (fm->pool_per_worker[my_cpu_number], e); + *poolindex = e - fm->pool_per_worker[my_cpu_number]; + fm->hash_per_worker[my_cpu_number][h] = *poolindex; + + e->key = *k; + + if (fm->passive_timer > 0) + { + e->passive_timer_handle = tw_timer_start_2t_1w_2048sl + (fm->timers_per_worker[my_cpu_number], *poolindex, 0, + fm->passive_timer); + } + return e; +} + +static inline void +add_to_flow_record_state (vlib_main_t * vm, vlib_node_runtime_t * node, + flowprobe_main_t * fm, vlib_buffer_t * b, + timestamp_nsec_t timestamp, u16 length, + flowprobe_variant_t which, flowprobe_trace_t * t) +{ + if (fm->disabled) + return; + + u32 my_cpu_number = vm->thread_index; + u16 octets = 0; + + flowprobe_record_t flags = fm->context[which].flags; + bool collect_ip4 = false, collect_ip6 = false; + ASSERT (b); + ethernet_header_t *eth = vlib_buffer_get_current (b); + u16 ethertype = clib_net_to_host_u16 (eth->type); + /* *INDENT-OFF* */ + flowprobe_key_t k = {}; + /* *INDENT-ON* */ + ip4_header_t *ip4 = 0; + ip6_header_t *ip6 = 0; + udp_header_t *udp = 0; + tcp_header_t *tcp = 0; + u8 tcp_flags = 0; + + if (flags & FLOW_RECORD_L3 || flags & FLOW_RECORD_L4) + { + collect_ip4 = which == FLOW_VARIANT_L2_IP4 || which == FLOW_VARIANT_IP4; + collect_ip6 = which == FLOW_VARIANT_L2_IP6 || which == FLOW_VARIANT_IP6; + } + + k.rx_sw_if_index = vnet_buffer (b)->sw_if_index[VLIB_RX]; + k.tx_sw_if_index = vnet_buffer (b)->sw_if_index[VLIB_TX]; + + k.which = which; + + if (flags & FLOW_RECORD_L2) + { + clib_memcpy (k.src_mac, eth->src_address, 6); + clib_memcpy (k.dst_mac, eth->dst_address, 6); + k.ethertype = ethertype; + } + if (collect_ip6 && ethertype == ETHERNET_TYPE_IP6) + { + ip6 = (ip6_header_t *) (eth + 1); + if (flags & FLOW_RECORD_L3) + { + k.src_address.as_u64[0] = ip6->src_address.as_u64[0]; + k.src_address.as_u64[1] = ip6->src_address.as_u64[1]; + k.dst_address.as_u64[0] = ip6->dst_address.as_u64[0]; + k.dst_address.as_u64[1] = ip6->dst_address.as_u64[1]; + } + k.protocol = ip6->protocol; + if (k.protocol == IP_PROTOCOL_UDP) + udp = (udp_header_t *) (ip6 + 1); + else if 
(k.protocol == IP_PROTOCOL_TCP) + tcp = (tcp_header_t *) (ip6 + 1); + + octets = clib_net_to_host_u16 (ip6->payload_length) + + sizeof (ip6_header_t); + } + if (collect_ip4 && ethertype == ETHERNET_TYPE_IP4) + { + ip4 = (ip4_header_t *) (eth + 1); + if (flags & FLOW_RECORD_L3) + { + k.src_address.ip4.as_u32 = ip4->src_address.as_u32; + k.dst_address.ip4.as_u32 = ip4->dst_address.as_u32; + } + k.protocol = ip4->protocol; + if ((flags & FLOW_RECORD_L4) && k.protocol == IP_PROTOCOL_UDP) + udp = (udp_header_t *) (ip4 + 1); + else if ((flags & FLOW_RECORD_L4) && k.protocol == IP_PROTOCOL_TCP) + tcp = (tcp_header_t *) (ip4 + 1); + + octets = clib_net_to_host_u16 (ip4->length); + } + + if (udp) + { + k.src_port = udp->src_port; + k.dst_port = udp->dst_port; + } + else if (tcp) + { + k.src_port = tcp->src_port; + k.dst_port = tcp->dst_port; + tcp_flags = tcp->flags; + } + + if (t) + { + t->rx_sw_if_index = k.rx_sw_if_index; + t->tx_sw_if_index = k.tx_sw_if_index; + clib_memcpy (t->src_mac, k.src_mac, 6); + clib_memcpy (t->dst_mac, k.dst_mac, 6); + t->ethertype = k.ethertype; + t->src_address.ip4.as_u32 = k.src_address.ip4.as_u32; + t->dst_address.ip4.as_u32 = k.dst_address.ip4.as_u32; + t->protocol = k.protocol; + t->src_port = k.src_port; + t->dst_port = k.dst_port; + t->which = k.which; + } + + flowprobe_entry_t *e = 0; + f64 now = vlib_time_now (vm); + if (fm->active_timer > 0) + { + u32 poolindex = ~0; + bool collision = false; + + e = flowprobe_lookup (my_cpu_number, &k, &poolindex, &collision); + if (collision) + { + /* Flush data and clean up entry for reuse. */ + if (e->packetcount) + flowprobe_export_entry (vm, e); + e->key = k; + e->flow_start = timestamp; + vlib_node_increment_counter (vm, node->node_index, + FLOWPROBE_ERROR_COLLISION, 1); + } + if (!e) /* Create new entry */ + { + e = flowprobe_create (my_cpu_number, &k, &poolindex); + e->last_exported = now; + e->flow_start = timestamp; + } + } + else + { + e = &fm->stateless_entry[my_cpu_number]; + e->key = k; + } + + if (e) + { + /* Updating entry */ + e->packetcount++; + e->octetcount += octets; + e->last_updated = now; + e->flow_end = timestamp; + e->prot.tcp.flags |= tcp_flags; + if (fm->active_timer == 0 + || (now > e->last_exported + fm->active_timer)) + flowprobe_export_entry (vm, e); + } +} + +static u16 +flowprobe_get_headersize (void) +{ + return sizeof (ip4_header_t) + sizeof (udp_header_t) + + sizeof (ipfix_message_header_t) + sizeof (ipfix_set_header_t); +} + +static void +flowprobe_export_send (vlib_main_t * vm, vlib_buffer_t * b0, + flowprobe_variant_t which) +{ + flowprobe_main_t *fm = &flowprobe_main; + flow_report_main_t *frm = &flow_report_main; + vlib_frame_t *f; + ip4_ipfix_template_packet_t *tp; + ipfix_set_header_t *s; + ipfix_message_header_t *h; + ip4_header_t *ip; + udp_header_t *udp; + flowprobe_record_t flags = fm->context[which].flags; + u32 my_cpu_number = vm->thread_index; + + /* Fill in header */ + flow_report_stream_t *stream; + + /* Nothing to send */ + if (fm->context[which].next_record_offset_per_worker[my_cpu_number] <= + flowprobe_get_headersize ()) + return; + + u32 i, index = vec_len (frm->streams); + for (i = 0; i < index; i++) + if (frm->streams[i].domain_id == 1) + { + index = i; + break; + } + if (i == vec_len (frm->streams)) + { + vec_validate (frm->streams, index); + frm->streams[index].domain_id = 1; + } + stream = &frm->streams[index]; + + tp = vlib_buffer_get_current (b0); + ip = (ip4_header_t *) & tp->ip4; + udp = (udp_header_t *) (ip + 1); + h = (ipfix_message_header_t *) (udp + 1); 
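+  /*
+   * An export packet is a contiguous stack of headers:
+   * ip4_header_t | udp_header_t | ipfix_message_header_t |
+   * ipfix_set_header_t | data records.  ip, udp, h and s point
+   * successively into that stack; lengths and checksums are fixed
+   * up below once the final record length is known.
+   */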
+ s = (ipfix_set_header_t *) (h + 1); + + ip->ip_version_and_header_length = 0x45; + ip->ttl = 254; + ip->protocol = IP_PROTOCOL_UDP; + ip->flags_and_fragment_offset = 0; + ip->src_address.as_u32 = frm->src_address.as_u32; + ip->dst_address.as_u32 = frm->ipfix_collector.as_u32; + udp->src_port = clib_host_to_net_u16 (UDP_DST_PORT_ipfix); + udp->dst_port = clib_host_to_net_u16 (UDP_DST_PORT_ipfix); + udp->checksum = 0; + + /* FIXUP: message header export_time */ + h->export_time = (u32) + (((f64) frm->unix_time_0) + + (vlib_time_now (frm->vlib_main) - frm->vlib_time_0)); + h->export_time = clib_host_to_net_u32 (h->export_time); + h->domain_id = clib_host_to_net_u32 (stream->domain_id); + + /* FIXUP: message header sequence_number */ + h->sequence_number = stream->sequence_number++; + h->sequence_number = clib_host_to_net_u32 (h->sequence_number); + + s->set_id_length = ipfix_set_id_length (fm->template_reports[flags], + b0->current_length - + (sizeof (*ip) + sizeof (*udp) + + sizeof (*h))); + h->version_length = version_length (b0->current_length - + (sizeof (*ip) + sizeof (*udp))); + + ip->length = clib_host_to_net_u16 (b0->current_length); + + ip->checksum = ip4_header_checksum (ip); + udp->length = clib_host_to_net_u16 (b0->current_length - sizeof (*ip)); + + if (frm->udp_checksum) + { + /* RFC 7011 section 10.3.2. */ + udp->checksum = ip4_tcp_udp_compute_checksum (vm, b0, ip); + if (udp->checksum == 0) + udp->checksum = 0xffff; + } + + ASSERT (ip->checksum == ip4_header_checksum (ip)); + + /* Find or allocate a frame */ + f = fm->context[which].frames_per_worker[my_cpu_number]; + if (PREDICT_FALSE (f == 0)) + { + u32 *to_next; + f = vlib_get_frame_to_node (vm, ip4_lookup_node.index); + fm->context[which].frames_per_worker[my_cpu_number] = f; + u32 bi0 = vlib_get_buffer_index (vm, b0); + + /* Enqueue the buffer */ + to_next = vlib_frame_vector_args (f); + to_next[0] = bi0; + f->n_vectors = 1; + } + + vlib_put_frame_to_node (vm, ip4_lookup_node.index, f); + vlib_node_increment_counter (vm, flowprobe_l2_node.index, + FLOWPROBE_ERROR_EXPORTED_PACKETS, 1); + + fm->context[which].frames_per_worker[my_cpu_number] = 0; + fm->context[which].buffers_per_worker[my_cpu_number] = 0; + fm->context[which].next_record_offset_per_worker[my_cpu_number] = + flowprobe_get_headersize (); +} + +static vlib_buffer_t * +flowprobe_get_buffer (vlib_main_t * vm, flowprobe_variant_t which) +{ + flowprobe_main_t *fm = &flowprobe_main; + flow_report_main_t *frm = &flow_report_main; + vlib_buffer_t *b0; + u32 bi0; + vlib_buffer_free_list_t *fl; + u32 my_cpu_number = vm->thread_index; + + /* Find or allocate a buffer */ + b0 = fm->context[which].buffers_per_worker[my_cpu_number]; + + /* Need to allocate a buffer? 
*/ + if (PREDICT_FALSE (b0 == 0)) + { + if (vlib_buffer_alloc (vm, &bi0, 1) != 1) + { + vlib_node_increment_counter (vm, flowprobe_l2_node.index, + FLOWPROBE_ERROR_BUFFER, 1); + return 0; + } + + /* Initialize the buffer */ + b0 = fm->context[which].buffers_per_worker[my_cpu_number] = + vlib_get_buffer (vm, bi0); + fl = + vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + vlib_buffer_init_for_free_list (b0, fl); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + + b0->current_data = 0; + b0->current_length = flowprobe_get_headersize (); + b0->flags |= (VLIB_BUFFER_TOTAL_LENGTH_VALID | VLIB_BUFFER_FLOW_REPORT); + vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = frm->fib_index; + fm->context[which].next_record_offset_per_worker[my_cpu_number] = + b0->current_length; + } + + return b0; +} + +static void +flowprobe_export_entry (vlib_main_t * vm, flowprobe_entry_t * e) +{ + u32 my_cpu_number = vm->thread_index; + flowprobe_main_t *fm = &flowprobe_main; + flow_report_main_t *frm = &flow_report_main; + vlib_buffer_t *b0; + bool collect_ip4 = false, collect_ip6 = false; + flowprobe_variant_t which = e->key.which; + flowprobe_record_t flags = fm->context[which].flags; + u16 offset = + fm->context[which].next_record_offset_per_worker[my_cpu_number]; + + if (offset < flowprobe_get_headersize ()) + offset = flowprobe_get_headersize (); + + b0 = flowprobe_get_buffer (vm, which); + /* No available buffer, what to do... */ + if (b0 == 0) + return; + + if (flags & FLOW_RECORD_L3) + { + collect_ip4 = which == FLOW_VARIANT_L2_IP4 || which == FLOW_VARIANT_IP4; + collect_ip6 = which == FLOW_VARIANT_L2_IP6 || which == FLOW_VARIANT_IP6; + } + + offset += flowprobe_common_add (b0, e, offset); + + if (flags & FLOW_RECORD_L2) + offset += flowprobe_l2_add (b0, e, offset); + if (collect_ip6) + offset += flowprobe_l3_ip6_add (b0, e, offset); + if (collect_ip4) + offset += flowprobe_l3_ip4_add (b0, e, offset); + if (flags & FLOW_RECORD_L4) + offset += flowprobe_l4_add (b0, e, offset); + + /* Reset per flow-export counters */ + e->packetcount = 0; + e->octetcount = 0; + e->last_exported = vlib_time_now (vm); + + b0->current_length = offset; + + fm->context[which].next_record_offset_per_worker[my_cpu_number] = offset; + /* Time to flush the buffer? */ + if (offset + fm->template_size[flags] > frm->path_mtu) + flowprobe_export_send (vm, b0, which); +} + +uword +flowprobe_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame, + flowprobe_variant_t which) +{ + u32 n_left_from, *from, *to_next; + flowprobe_next_t next_index; + flowprobe_main_t *fm = &flowprobe_main; + timestamp_nsec_t timestamp; + + unix_time_now_nsec_fraction (&timestamp.sec, &timestamp.nsec); + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next >= 2) + { + u32 next0 = FLOWPROBE_NEXT_DROP; + u32 next1 = FLOWPROBE_NEXT_DROP; + u16 len0, len1; + u32 bi0, bi1; + vlib_buffer_t *b0, *b1; + + /* Prefetch next iteration.
*/ + { + vlib_buffer_t *p2, *p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE); + } + + /* speculatively enqueue b0 and b1 to the current next frame */ + to_next[0] = bi0 = from[0]; + to_next[1] = bi1 = from[1]; + from += 2; + to_next += 2; + n_left_from -= 2; + n_left_to_next -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + vnet_feature_next (vnet_buffer (b0)->sw_if_index[VLIB_TX], + &next0, b0); + vnet_feature_next (vnet_buffer (b1)->sw_if_index[VLIB_TX], + &next1, b1); + + len0 = vlib_buffer_length_in_chain (vm, b0); + ethernet_header_t *eh0 = vlib_buffer_get_current (b0); + u16 ethertype0 = clib_net_to_host_u16 (eh0->type); + + if (PREDICT_TRUE ((b0->flags & VLIB_BUFFER_FLOW_REPORT) == 0)) + add_to_flow_record_state (vm, node, fm, b0, timestamp, len0, + flowprobe_get_variant + (which, fm->context[which].flags, + ethertype0), 0); + + len1 = vlib_buffer_length_in_chain (vm, b1); + ethernet_header_t *eh1 = vlib_buffer_get_current (b1); + u16 ethertype1 = clib_net_to_host_u16 (eh1->type); + + if (PREDICT_TRUE ((b1->flags & VLIB_BUFFER_FLOW_REPORT) == 0)) + add_to_flow_record_state (vm, node, fm, b1, timestamp, len1, + flowprobe_get_variant + (which, fm->context[which].flags, + ethertype1), 0); + + /* verify speculative enqueues, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, next0, next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + u32 next0 = FLOWPROBE_NEXT_DROP; + u16 len0; + + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + vnet_feature_next (vnet_buffer (b0)->sw_if_index[VLIB_TX], + &next0, b0); + + len0 = vlib_buffer_length_in_chain (vm, b0); + ethernet_header_t *eh0 = vlib_buffer_get_current (b0); + u16 ethertype0 = clib_net_to_host_u16 (eh0->type); + + if (PREDICT_TRUE ((b0->flags & VLIB_BUFFER_FLOW_REPORT) == 0)) + { + flowprobe_trace_t *t = 0; + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + t = vlib_add_trace (vm, node, b0, sizeof (*t)); + + add_to_flow_record_state (vm, node, fm, b0, timestamp, len0, + flowprobe_get_variant + (which, fm->context[which].flags, + ethertype0), t); + } + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + return frame->n_vectors; +} + +static uword +flowprobe_ip4_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + return flowprobe_node_fn (vm, node, frame, FLOW_VARIANT_IP4); +} + +static uword +flowprobe_ip6_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + return flowprobe_node_fn (vm, node, frame, FLOW_VARIANT_IP6); +} + +static uword +flowprobe_l2_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + return flowprobe_node_fn (vm, node, frame, FLOW_VARIANT_L2); +} + +static inline void +flush_record (flowprobe_variant_t which) +{ + vlib_main_t *vm = vlib_get_main (); + 
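The dual loop in flowprobe_node_fn above follows the stock VPP windowing: while at least four buffers remain, two are processed and the headers of the following two are prefetched. A minimal model of just that control flow, where process () and prefetch () stand in for the real buffer operations:

static void
process_vector (int *b, int n, void (*prefetch) (int), void (*process) (int))
{
  while (n >= 4)
    {
      prefetch (b[2]);		/* warm buffers two iterations ahead */
      prefetch (b[3]);
      process (b[0]);
      process (b[1]);
      b += 2;
      n -= 2;
    }
  while (n > 0)			/* single-buffer tail loop */
    {
      process (b[0]);
      b += 1;
      n -= 1;
    }
}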
vlib_buffer_t *b = flowprobe_get_buffer (vm, which); + if (b) + flowprobe_export_send (vm, b, which); +} + +void +flowprobe_flush_callback_ip4 (void) +{ + flush_record (FLOW_VARIANT_IP4); +} + +void +flowprobe_flush_callback_ip6 (void) +{ + flush_record (FLOW_VARIANT_IP6); +} + +void +flowprobe_flush_callback_l2 (void) +{ + flush_record (FLOW_VARIANT_L2); + flush_record (FLOW_VARIANT_L2_IP4); + flush_record (FLOW_VARIANT_L2_IP6); +} + + +static void +flowprobe_delete_by_index (u32 my_cpu_number, u32 poolindex) +{ + flowprobe_main_t *fm = &flowprobe_main; + flowprobe_entry_t *e; + u32 h; + + e = pool_elt_at_index (fm->pool_per_worker[my_cpu_number], poolindex); + + /* Get my index */ + h = flowprobe_hash (&e->key); + + /* Reset hash */ + fm->hash_per_worker[my_cpu_number][h] = ~0; + + pool_put_index (fm->pool_per_worker[my_cpu_number], poolindex); +} + + +/* Per worker process processing the active/passive expired entries */ +static uword +flowprobe_walker_process (vlib_main_t * vm, + vlib_node_runtime_t * rt, vlib_frame_t * f) +{ + flowprobe_main_t *fm = &flowprobe_main; + flow_report_main_t *frm = &flow_report_main; + flowprobe_entry_t *e; + + /* + * $$$$ Remove this check from here and track FRM status and disable + * this process if required. + */ + if (frm->ipfix_collector.as_u32 == 0 || frm->src_address.as_u32 == 0) + { + fm->disabled = true; + return 0; + } + fm->disabled = false; + + u32 cpu_index = os_get_thread_index (); + u32 *to_be_removed = 0, *i; + + /* + * Tick the timer when required and process the vector of expired + * timers + */ + f64 start_time = vlib_time_now (vm); + u32 count = 0; + + tw_timer_expire_timers_2t_1w_2048sl (fm->timers_per_worker[cpu_index], + start_time); + + vec_foreach (i, fm->expired_passive_per_worker[cpu_index]) + { + u32 exported = 0; + f64 now = vlib_time_now (vm); + if (now > start_time + 100e-6 + || exported > FLOW_MAXIMUM_EXPORT_ENTRIES - 1) + break; + + if (pool_is_free_index (fm->pool_per_worker[cpu_index], *i)) + { + clib_warning ("Element is %d is freed already\n", *i); + continue; + } + else + e = pool_elt_at_index (fm->pool_per_worker[cpu_index], *i); + + /* Check last update timestamp. If it is longer than passive time nuke + * entry. 
Otherwise restart timer with what's left + * Premature passive timer by more than 10% + */ + if ((now - e->last_updated) < (u64) (fm->passive_timer * 0.9)) + { + u64 delta = fm->passive_timer - (now - e->last_updated); + e->passive_timer_handle = tw_timer_start_2t_1w_2048sl + (fm->timers_per_worker[cpu_index], *i, 0, delta); + } + else /* Nuke entry */ + { + vec_add1 (to_be_removed, *i); + } + /* If anything to report send it to the exporter */ + if (e->packetcount && now > e->last_exported + fm->active_timer) + { + exported++; + flowprobe_export_entry (vm, e); + } + count++; + } + if (count) + vec_delete (fm->expired_passive_per_worker[cpu_index], count, 0); + + vec_foreach (i, to_be_removed) flowprobe_delete_by_index (cpu_index, *i); + vec_free (to_be_removed); + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (flowprobe_ip4_node) = { + .function = flowprobe_ip4_node_fn, + .name = "flowprobe-ip4", + .vector_size = sizeof (u32), + .format_trace = format_flowprobe_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN(flowprobe_error_strings), + .error_strings = flowprobe_error_strings, + .n_next_nodes = FLOWPROBE_N_NEXT, + .next_nodes = FLOWPROBE_NEXT_NODES, +}; +VLIB_REGISTER_NODE (flowprobe_ip6_node) = { + .function = flowprobe_ip6_node_fn, + .name = "flowprobe-ip6", + .vector_size = sizeof (u32), + .format_trace = format_flowprobe_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN(flowprobe_error_strings), + .error_strings = flowprobe_error_strings, + .n_next_nodes = FLOWPROBE_N_NEXT, + .next_nodes = FLOWPROBE_NEXT_NODES, +}; +VLIB_REGISTER_NODE (flowprobe_l2_node) = { + .function = flowprobe_l2_node_fn, + .name = "flowprobe-l2", + .vector_size = sizeof (u32), + .format_trace = format_flowprobe_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN(flowprobe_error_strings), + .error_strings = flowprobe_error_strings, + .n_next_nodes = FLOWPROBE_N_NEXT, + .next_nodes = FLOWPROBE_NEXT_NODES, +}; +VLIB_REGISTER_NODE (flowprobe_walker_node) = { + .function = flowprobe_walker_process, + .name = "flowprobe-walker", + .type = VLIB_NODE_TYPE_INPUT, + .state = VLIB_NODE_STATE_INTERRUPT, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/gtpu.am b/src/plugins/gtpu.am new file mode 100644 index 00000000..f4cca094 --- /dev/null +++ b/src/plugins/gtpu.am @@ -0,0 +1,38 @@ +# Copyright (c) 2016 Intel and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
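One point worth calling out in the flowprobe walker above: the timer wheel may expire an entry up to 10% before its passive deadline, so an early-fired timer is re-armed with the remaining lifetime instead of the entry being freed. The bookkeeping in isolation (a sketch; the names mirror the entry fields used above):

/* Returns the ticks still to wait before the entry may be nuked,
   or 0 once it has been idle for the full passive timeout. */
static double
passive_ticks_left (double now, double last_updated, double passive_timer)
{
  double idle = now - last_updated;
  if (idle < passive_timer * 0.9)	/* fired early: re-arm */
    return passive_timer - idle;
  return 0;				/* genuinely expired: delete */
}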
+ +vppapitestplugins_LTLIBRARIES += gtpu_test_plugin.la +vppplugins_LTLIBRARIES += gtpu_plugin.la + +gtpu_plugin_la_SOURCES = \ + gtpu/gtpu_decap.c \ + gtpu/gtpu_encap.c \ + gtpu/gtpu.c \ + gtpu/gtpu_api.c + +BUILT_SOURCES += \ + gtpu/gtpu.api.h \ + gtpu/gtpu.api.json + +API_FILES += gtpu/gtpu.api + +nobase_apiinclude_HEADERS += \ + gtpu/gtpu_all_api_h.h \ + gtpu/gtpu_msg_enum.h \ + gtpu/gtpu.api.h + +gtpu_test_plugin_la_SOURCES = \ + gtpu/gtpu_test.c \ + gtpu/gtpu_plugin.api.h + +# vi:syntax=automake diff --git a/src/plugins/gtpu/gtpu.api b/src/plugins/gtpu/gtpu.api new file mode 100644 index 00000000..55ba0390 --- /dev/null +++ b/src/plugins/gtpu/gtpu.api @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2017 Intel and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** \brief Set or delete an GTPU tunnel + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param is_add - add address if non-zero, else delete + @param is_ipv6 - src_address and dst_address is ipv6 or not + @param src_address - GTPU tunnel's source address. + @param dst_address - GTPU tunnel's destination address. + @param mcast_sw_if_index - version, O-bit and C-bit (see nsh_packet.h) + @param encap_vrf_id - fib identifier used for outgoing encapsulated packets + @param decap_next_index - the index of the next node if success + @param teid - Local Tunnel Endpoint Identifier +*/ +define gtpu_add_del_tunnel +{ + u32 client_index; + u32 context; + u8 is_add; + u8 is_ipv6; + u8 src_address[16]; + u8 dst_address[16]; + u32 mcast_sw_if_index; + u32 encap_vrf_id; + u32 decap_next_index; + u32 teid; +}; + +/** \brief reply for set or delete an GTPU tunnel + @param context - sender context, to match reply w/ request + @param retval - return code + @param sw_if_index - software index of the interface +*/ +define gtpu_add_del_tunnel_reply +{ + u32 context; + i32 retval; + u32 sw_if_index; +}; + +/** \brief Dump GTPU tunnel + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param sw_if_index - software index of the interface +*/ +define gtpu_tunnel_dump +{ + u32 client_index; + u32 context; + u32 sw_if_index; +}; + +/** \brief dump details of an GTPU tunnel + @param context - sender context, to match reply w/ request + @param sw_if_index - software index of the interface + @param is_ipv6 - src_address and dst_address is ipv6 or not + @param src_address - GTPU tunnel's source address. + @param dst_address - GTPU tunnel's destination address. 
+ @param mcast_sw_if_index - version, O-bit and C-bit (see nsh_packet.h) + @param encap_vrf_id - fib identifier used for outgoing encapsulated packets + @param decap_next_index - the index of the next node if success + @param teid - Local Tunnel Endpoint Identifier +*/ +define gtpu_tunnel_details +{ + u32 context; + u32 sw_if_index; + u8 is_ipv6; + u8 src_address[16]; + u8 dst_address[16]; + u32 mcast_sw_if_index; + u32 encap_vrf_id; + u32 decap_next_index; + u32 teid; +}; + +/** \brief Interface set gtpu-bypass request + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param sw_if_index - interface used to reach neighbor + @param is_ipv6 - if non-zero, enable ipv6-gtpu-bypass, else ipv4-gtpu-bypass + @param enable - if non-zero enable, else disable +*/ +define sw_interface_set_gtpu_bypass +{ + u32 client_index; + u32 context; + u32 sw_if_index; + u8 is_ipv6; + u8 enable; +}; + +/** \brief Interface set gtpu-bypass response + @param context - sender context, to match reply w/ request + @param retval - return code for the request +*/ +define sw_interface_set_gtpu_bypass_reply +{ + u32 context; + i32 retval; +}; + +/* + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/gtpu/gtpu.c b/src/plugins/gtpu/gtpu.c new file mode 100755 index 00000000..3dfb4210 --- /dev/null +++ b/src/plugins/gtpu/gtpu.c @@ -0,0 +1,1151 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2017 Intel and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
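The gtpu_add_del_tunnel message defined in gtpu.api above is a fixed-size binary record: integer fields travel in network byte order (the handler in gtpu_api.c later calls ntohl () on each), and an IPv4 address occupies the first 4 bytes of the 16-byte address arrays. A client-side sketch of filling one; the struct simply mirrors the .api definition and the VPP message-header plumbing is omitted:

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

typedef struct			/* mirrors the .api message fields */
{
  uint32_t client_index, context;
  uint8_t is_add, is_ipv6;
  uint8_t src_address[16], dst_address[16];
  uint32_t mcast_sw_if_index, encap_vrf_id, decap_next_index, teid;
} gtpu_add_del_tunnel_msg;

static void
fill_v4_request (gtpu_add_del_tunnel_msg * mp,
		 uint32_t src_net, uint32_t dst_net, uint32_t teid_host)
{
  memset (mp, 0, sizeof (*mp));
  mp->is_add = 1;
  mp->is_ipv6 = 0;			/* v4: first 4 bytes of each array */
  memcpy (mp->src_address, &src_net, 4);	/* already network order */
  memcpy (mp->dst_address, &dst_net, 4);
  mp->teid = htonl (teid_host);		/* handler does ntohl () */
  mp->mcast_sw_if_index = htonl (~0u);	/* unicast dst: unused */
  mp->decap_next_index = htonl (~0u);	/* ~0 selects the l2 default */
}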
+ *------------------------------------------------------------------ + */ +#include <stdint.h> +#include <net/if.h> +#include <sys/ioctl.h> +#include <inttypes.h> + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <vnet/ethernet/ethernet.h> +#include <vnet/fib/fib_entry.h> +#include <vnet/fib/fib_table.h> +#include <vnet/mfib/mfib_table.h> +#include <vnet/adj/adj_mcast.h> +#include <vnet/dpo/dpo.h> +#include <vnet/plugin/plugin.h> +#include <vpp/app/version.h> +#include <gtpu/gtpu.h> + + +gtpu_main_t gtpu_main; + +/* *INDENT-OFF* */ +VNET_FEATURE_INIT (ip4_gtpu_bypass, static) = { + .arc_name = "ip4-unicast", + .node_name = "ip4-gtpu-bypass", + .runs_before = VNET_FEATURES ("ip4-lookup"), +}; + +VNET_FEATURE_INIT (ip6_gtpu_bypass, static) = { + .arc_name = "ip6-unicast", + .node_name = "ip6-gtpu-bypass", + .runs_before = VNET_FEATURES ("ip6-lookup"), +}; +/* *INDENT-on* */ + +static u8 * +format_decap_next (u8 * s, va_list * args) +{ + u32 next_index = va_arg (*args, u32); + + switch (next_index) + { + case GTPU_INPUT_NEXT_DROP: + return format (s, "drop"); + case GTPU_INPUT_NEXT_L2_INPUT: + return format (s, "l2"); + case GTPU_INPUT_NEXT_IP4_INPUT: + return format (s, "ip4"); + case GTPU_INPUT_NEXT_IP6_INPUT: + return format (s, "ip6"); + default: + return format (s, "index %d", next_index); + } + return s; +} + +u8 * +format_gtpu_tunnel (u8 * s, va_list * args) +{ + gtpu_tunnel_t *t = va_arg (*args, gtpu_tunnel_t *); + gtpu_main_t *ngm = >pu_main; + + s = format (s, "[%d] src %U dst %U teid %d sw_if_index %d ", + t - ngm->tunnels, + format_ip46_address, &t->src, IP46_TYPE_ANY, + format_ip46_address, &t->dst, IP46_TYPE_ANY, + t->teid, t->sw_if_index); + + if (ip46_address_is_multicast (&t->dst)) + s = format (s, "mcast_sw_if_index %d ", t->mcast_sw_if_index); + + s = format (s, "encap_fib_index %d fib_entry_index %d decap_next %U\n", + t->encap_fib_index, t->fib_entry_index, + format_decap_next, t->decap_next_index); + return s; +} + +static u8 * +format_gtpu_name (u8 * s, va_list * args) +{ + u32 dev_instance = va_arg (*args, u32); + return format (s, "gtpu_tunnel%d", dev_instance); +} + +static uword +dummy_interface_tx (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + clib_warning ("you shouldn't be here, leaking buffers..."); + return frame->n_vectors; +} + +static clib_error_t * +gtpu_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags) +{ + u32 hw_flags = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) ? + VNET_HW_INTERFACE_FLAG_LINK_UP : 0; + vnet_hw_interface_set_flags (vnm, hw_if_index, hw_flags); + + return /* no error */ 0; +} + +/* *INDENT-OFF* */ +VNET_DEVICE_CLASS (gtpu_device_class,static) = { + .name = "GTPU", + .format_device_name = format_gtpu_name, + .format_tx_trace = format_gtpu_encap_trace, + .tx_function = dummy_interface_tx, + .admin_up_down_function = gtpu_interface_admin_up_down, +}; +/* *INDENT-ON* */ + +static u8 * +format_gtpu_header_with_length (u8 * s, va_list * args) +{ + u32 dev_instance = va_arg (*args, u32); + s = format (s, "unimplemented dev %u", dev_instance); + return s; +} + +/* *INDENT-OFF* */ +VNET_HW_INTERFACE_CLASS (gtpu_hw_class) = +{ + .name = "GTPU", + .format_header = format_gtpu_header_with_length, + .build_rewrite = default_build_rewrite, +}; +/* *INDENT-ON* */ + +static void +gtpu_tunnel_restack_dpo (gtpu_tunnel_t * t) +{ + dpo_id_t dpo = DPO_INVALID; + u32 encap_index = ip46_address_is_ip4 (&t->dst) ? 
+ gtpu4_encap_node.index : gtpu6_encap_node.index; + fib_forward_chain_type_t forw_type = ip46_address_is_ip4 (&t->dst) ? + FIB_FORW_CHAIN_TYPE_UNICAST_IP4 : FIB_FORW_CHAIN_TYPE_UNICAST_IP6; + + fib_entry_contribute_forwarding (t->fib_entry_index, forw_type, &dpo); + dpo_stack_from_node (encap_index, &t->next_dpo, &dpo); + dpo_reset (&dpo); +} + +static gtpu_tunnel_t * +gtpu_tunnel_from_fib_node (fib_node_t * node) +{ + return ((gtpu_tunnel_t *) (((char *) node) - + STRUCT_OFFSET_OF (gtpu_tunnel_t, node))); +} + +/** + * Function definition to backwalk a FIB node - + * Here we will restack the new dpo of GTPU DIP to encap node. + */ +static fib_node_back_walk_rc_t +gtpu_tunnel_back_walk (fib_node_t * node, fib_node_back_walk_ctx_t * ctx) +{ + gtpu_tunnel_restack_dpo (gtpu_tunnel_from_fib_node (node)); + return (FIB_NODE_BACK_WALK_CONTINUE); +} + +/** + * Function definition to get a FIB node from its index + */ +static fib_node_t * +gtpu_tunnel_fib_node_get (fib_node_index_t index) +{ + gtpu_tunnel_t *t; + gtpu_main_t *gtm = >pu_main; + + t = pool_elt_at_index (gtm->tunnels, index); + + return (&t->node); +} + +/** + * Function definition to inform the FIB node that its last lock has gone. + */ +static void +gtpu_tunnel_last_lock_gone (fib_node_t * node) +{ + /* + * The GTPU tunnel is a root of the graph. As such + * it never has children and thus is never locked. + */ + ASSERT (0); +} + +/* + * Virtual function table registered by GTPU tunnels + * for participation in the FIB object graph. + */ +const static fib_node_vft_t gtpu_vft = { + .fnv_get = gtpu_tunnel_fib_node_get, + .fnv_last_lock = gtpu_tunnel_last_lock_gone, + .fnv_back_walk = gtpu_tunnel_back_walk, +}; + + +#define foreach_copy_field \ +_(teid) \ +_(mcast_sw_if_index) \ +_(encap_fib_index) \ +_(decap_next_index) \ +_(src) \ +_(dst) + +static void +ip_udp_gtpu_rewrite (gtpu_tunnel_t * t, bool is_ip6) +{ + union + { + ip4_gtpu_header_t *h4; + ip6_gtpu_header_t *h6; + u8 *rw; + } r = + { + .rw = 0}; + int len = is_ip6 ? sizeof *r.h6 : sizeof *r.h4; + + vec_validate_aligned (r.rw, len - 1, CLIB_CACHE_LINE_BYTES); + + udp_header_t *udp; + gtpu_header_t *gtpu; + /* Fixed portion of the (outer) ip header */ + if (!is_ip6) + { + ip4_header_t *ip = &r.h4->ip4; + udp = &r.h4->udp; + gtpu = &r.h4->gtpu; + ip->ip_version_and_header_length = 0x45; + ip->ttl = 254; + ip->protocol = IP_PROTOCOL_UDP; + + ip->src_address = t->src.ip4; + ip->dst_address = t->dst.ip4; + + /* we fix up the ip4 header length and checksum after-the-fact */ + ip->checksum = ip4_header_checksum (ip); + } + else + { + ip6_header_t *ip = &r.h6->ip6; + udp = &r.h6->udp; + gtpu = &r.h6->gtpu; + ip->ip_version_traffic_class_and_flow_label = + clib_host_to_net_u32 (6 << 28); + ip->hop_limit = 255; + ip->protocol = IP_PROTOCOL_UDP; + + ip->src_address = t->src.ip6; + ip->dst_address = t->dst.ip6; + } + + /* UDP header, randomize src port on something, maybe? */ + udp->src_port = clib_host_to_net_u16 (2152); + udp->dst_port = clib_host_to_net_u16 (UDP_DST_PORT_GTPU); + + /* GTPU header */ + gtpu->ver_flags = GTPU_V1_VER | GTPU_PT_GTP; + gtpu->type = GTPU_TYPE_GTPU; + gtpu->teid = clib_host_to_net_u32 (t->teid); + + t->rewrite = r.rw; + /* Now only support 8-byte gtpu header. TBD */ + _vec_len (t->rewrite) = sizeof (ip4_gtpu_header_t) - 4; + + return; +} + +static bool +gtpu_decap_next_is_valid (gtpu_main_t * gtm, u32 is_ip6, u32 decap_next_index) +{ + vlib_main_t *vm = gtm->vlib_main; + u32 input_idx = (!is_ip6) ? 
gtpu4_input_node.index : gtpu6_input_node.index; + vlib_node_runtime_t *r = vlib_node_get_runtime (vm, input_idx); + + return decap_next_index < r->n_next_nodes; +} + +static void +hash_set_key_copy (uword ** h, void *key, uword v) +{ + size_t ksz = hash_header (*h)->user; + void *copy = clib_mem_alloc (ksz); + clib_memcpy (copy, key, ksz); + hash_set_mem (*h, copy, v); +} + +static void +hash_unset_key_free (uword ** h, void *key) +{ + hash_pair_t *hp = hash_get_pair_mem (*h, key); + ASSERT (hp); + key = uword_to_pointer (hp->key, void *); + hash_unset_mem (*h, key); + clib_mem_free (key); +} + +static uword +vtep_addr_ref (ip46_address_t * ip) +{ + uword *vtep = ip46_address_is_ip4 (ip) ? + hash_get (gtpu_main.vtep4, ip->ip4.as_u32) : + hash_get_mem (gtpu_main.vtep6, &ip->ip6); + if (vtep) + return ++(*vtep); + ip46_address_is_ip4 (ip) ? + hash_set (gtpu_main.vtep4, ip->ip4.as_u32, 1) : + hash_set_key_copy (&gtpu_main.vtep6, &ip->ip6, 1); + return 1; +} + +static uword +vtep_addr_unref (ip46_address_t * ip) +{ + uword *vtep = ip46_address_is_ip4 (ip) ? + hash_get (gtpu_main.vtep4, ip->ip4.as_u32) : + hash_get_mem (gtpu_main.vtep6, &ip->ip6); + ASSERT (vtep); + if (--(*vtep) != 0) + return *vtep; + ip46_address_is_ip4 (ip) ? + hash_unset (gtpu_main.vtep4, ip->ip4.as_u32) : + hash_unset_key_free (&gtpu_main.vtep6, &ip->ip6); + return 0; +} + +typedef CLIB_PACKED (union + { + struct + { + fib_node_index_t mfib_entry_index; + adj_index_t mcast_adj_index; + }; u64 as_u64; + }) mcast_shared_t; + +static inline mcast_shared_t +mcast_shared_get (ip46_address_t * ip) +{ + ASSERT (ip46_address_is_multicast (ip)); + uword *p = hash_get_mem (gtpu_main.mcast_shared, ip); + ASSERT (p); + return (mcast_shared_t) + { + .as_u64 = *p}; +} + +static inline void +mcast_shared_add (ip46_address_t * dst, fib_node_index_t mfei, adj_index_t ai) +{ + mcast_shared_t new_ep = { + .mcast_adj_index = ai, + .mfib_entry_index = mfei, + }; + + hash_set_key_copy (&gtpu_main.mcast_shared, dst, new_ep.as_u64); +} + +static inline void +mcast_shared_remove (ip46_address_t * dst) +{ + mcast_shared_t ep = mcast_shared_get (dst); + + adj_unlock (ep.mcast_adj_index); + mfib_table_entry_delete_index (ep.mfib_entry_index, MFIB_SOURCE_GTPU); + + hash_unset_key_free (&gtpu_main.mcast_shared, dst); +} + +static inline fib_protocol_t +fib_ip_proto (bool is_ip6) +{ + return (is_ip6) ?
FIB_PROTOCOL_IP6 : FIB_PROTOCOL_IP4; +} + +int vnet_gtpu_add_del_tunnel + (vnet_gtpu_add_del_tunnel_args_t * a, u32 * sw_if_indexp) +{ + gtpu_main_t *gtm = >pu_main; + gtpu_tunnel_t *t = 0; + vnet_main_t *vnm = gtm->vnet_main; + uword *p; + u32 hw_if_index = ~0; + u32 sw_if_index = ~0; + gtpu4_tunnel_key_t key4; + gtpu6_tunnel_key_t key6; + u32 is_ip6 = a->is_ip6; + + if (!is_ip6) + { + key4.src = a->dst.ip4.as_u32; /* decap src in key is encap dst in config */ + key4.teid = clib_host_to_net_u32 (a->teid); + p = hash_get (gtm->gtpu4_tunnel_by_key, key4.as_u64); + } + else + { + key6.src = a->dst.ip6; + key6.teid = clib_host_to_net_u32 (a->teid); + p = hash_get_mem (gtm->gtpu6_tunnel_by_key, &key6); + } + + if (a->is_add) + { + l2input_main_t *l2im = &l2input_main; + + /* adding a tunnel: tunnel must not already exist */ + if (p) + return VNET_API_ERROR_TUNNEL_EXIST; + + /*if not set explicitly, default to l2 */ + if (a->decap_next_index == ~0) + a->decap_next_index = GTPU_INPUT_NEXT_L2_INPUT; + if (!gtpu_decap_next_is_valid (gtm, is_ip6, a->decap_next_index)) + return VNET_API_ERROR_INVALID_DECAP_NEXT; + + pool_get_aligned (gtm->tunnels, t, CLIB_CACHE_LINE_BYTES); + memset (t, 0, sizeof (*t)); + + /* copy from arg structure */ +#define _(x) t->x = a->x; + foreach_copy_field; +#undef _ + + ip_udp_gtpu_rewrite (t, is_ip6); + + /* copy the key */ + if (is_ip6) + hash_set_key_copy (>m->gtpu6_tunnel_by_key, &key6, + t - gtm->tunnels); + else + hash_set (gtm->gtpu4_tunnel_by_key, key4.as_u64, t - gtm->tunnels); + + vnet_hw_interface_t *hi; + if (vec_len (gtm->free_gtpu_tunnel_hw_if_indices) > 0) + { + vnet_interface_main_t *im = &vnm->interface_main; + hw_if_index = gtm->free_gtpu_tunnel_hw_if_indices + [vec_len (gtm->free_gtpu_tunnel_hw_if_indices) - 1]; + _vec_len (gtm->free_gtpu_tunnel_hw_if_indices) -= 1; + + hi = vnet_get_hw_interface (vnm, hw_if_index); + hi->dev_instance = t - gtm->tunnels; + hi->hw_instance = hi->dev_instance; + + /* clear old stats of freed tunnel before reuse */ + sw_if_index = hi->sw_if_index; + vnet_interface_counter_lock (im); + vlib_zero_combined_counter + (&im->combined_sw_if_counters[VNET_INTERFACE_COUNTER_TX], + sw_if_index); + vlib_zero_combined_counter (&im->combined_sw_if_counters + [VNET_INTERFACE_COUNTER_RX], + sw_if_index); + vlib_zero_simple_counter (&im->sw_if_counters + [VNET_INTERFACE_COUNTER_DROP], + sw_if_index); + vnet_interface_counter_unlock (im); + } + else + { + hw_if_index = vnet_register_interface + (vnm, gtpu_device_class.index, t - gtm->tunnels, + gtpu_hw_class.index, t - gtm->tunnels); + hi = vnet_get_hw_interface (vnm, hw_if_index); + } + + t->hw_if_index = hw_if_index; + t->sw_if_index = sw_if_index = hi->sw_if_index; + + vec_validate_init_empty (gtm->tunnel_index_by_sw_if_index, sw_if_index, + ~0); + gtm->tunnel_index_by_sw_if_index[sw_if_index] = t - gtm->tunnels; + + /* setup l2 input config with l2 feature and bd 0 to drop packet */ + vec_validate (l2im->configs, sw_if_index); + l2im->configs[sw_if_index].feature_bitmap = L2INPUT_FEAT_DROP; + l2im->configs[sw_if_index].bd_index = 0; + + vnet_sw_interface_t *si = vnet_get_sw_interface (vnm, sw_if_index); + si->flags &= ~VNET_SW_INTERFACE_FLAG_HIDDEN; + vnet_sw_interface_set_flags (vnm, sw_if_index, + VNET_SW_INTERFACE_FLAG_ADMIN_UP); + + fib_node_init (&t->node, gtm->fib_node_type); + fib_prefix_t tun_dst_pfx; + u32 encap_index = !is_ip6 ? 
+ gtpu4_encap_node.index : gtpu6_encap_node.index; + vnet_flood_class_t flood_class = VNET_FLOOD_CLASS_TUNNEL_NORMAL; + + fib_prefix_from_ip46_addr (&t->dst, &tun_dst_pfx); + if (!ip46_address_is_multicast (&t->dst)) + { + /* Unicast tunnel - + * source the FIB entry for the tunnel's destination + * and become a child thereof. The tunnel will then get poked + * when the forwarding for the entry updates, and the tunnel can + * re-stack accordingly + */ + vtep_addr_ref (&t->src); + t->fib_entry_index = fib_table_entry_special_add + (t->encap_fib_index, &tun_dst_pfx, FIB_SOURCE_RR, + FIB_ENTRY_FLAG_NONE); + t->sibling_index = fib_entry_child_add + (t->fib_entry_index, gtm->fib_node_type, t - gtm->tunnels); + gtpu_tunnel_restack_dpo (t); + } + else + { + /* Multicast tunnel - + * as the same mcast group can be used for mutiple mcast tunnels + * with different VNIs, create the output fib adjecency only if + * it does not already exist + */ + fib_protocol_t fp = fib_ip_proto (is_ip6); + + if (vtep_addr_ref (&t->dst) == 1) + { + fib_node_index_t mfei; + adj_index_t ai; + fib_route_path_t path = { + .frp_proto = fib_proto_to_dpo (fp), + .frp_addr = zero_addr, + .frp_sw_if_index = 0xffffffff, + .frp_fib_index = ~0, + .frp_weight = 0, + .frp_flags = FIB_ROUTE_PATH_LOCAL, + }; + const mfib_prefix_t mpfx = { + .fp_proto = fp, + .fp_len = (is_ip6 ? 128 : 32), + .fp_grp_addr = tun_dst_pfx.fp_addr, + }; + + /* + * Setup the (*,G) to receive traffic on the mcast group + * - the forwarding interface is for-us + * - the accepting interface is that from the API + */ + mfib_table_entry_path_update (t->encap_fib_index, + &mpfx, + MFIB_SOURCE_GTPU, + &path, MFIB_ITF_FLAG_FORWARD); + + path.frp_sw_if_index = a->mcast_sw_if_index; + path.frp_flags = FIB_ROUTE_PATH_FLAG_NONE; + mfei = mfib_table_entry_path_update (t->encap_fib_index, + &mpfx, + MFIB_SOURCE_GTPU, + &path, + MFIB_ITF_FLAG_ACCEPT); + + /* + * Create the mcast adjacency to send traffic to the group + */ + ai = adj_mcast_add_or_lock (fp, + fib_proto_to_link (fp), + a->mcast_sw_if_index); + + /* + * create a new end-point + */ + mcast_shared_add (&t->dst, mfei, ai); + } + + dpo_id_t dpo = DPO_INVALID; + mcast_shared_t ep = mcast_shared_get (&t->dst); + + /* Stack shared mcast dst mac addr rewrite on encap */ + dpo_set (&dpo, DPO_ADJACENCY_MCAST, + fib_proto_to_dpo (fp), ep.mcast_adj_index); + + dpo_stack_from_node (encap_index, &t->next_dpo, &dpo); + + dpo_reset (&dpo); + flood_class = VNET_FLOOD_CLASS_TUNNEL_MASTER; + } + + /* Set gtpu tunnel output node */ + hi->output_node_index = encap_index; + + vnet_get_sw_interface (vnet_get_main (), sw_if_index)->flood_class = + flood_class; + } + else + { + /* deleting a tunnel: tunnel must exist */ + if (!p) + return VNET_API_ERROR_NO_SUCH_ENTRY; + + t = pool_elt_at_index (gtm->tunnels, p[0]); + sw_if_index = t->sw_if_index; + + vnet_sw_interface_set_flags (vnm, t->sw_if_index, 0 /* down */ ); + vnet_sw_interface_t *si = vnet_get_sw_interface (vnm, t->sw_if_index); + si->flags |= VNET_SW_INTERFACE_FLAG_HIDDEN; + + /* make sure tunnel is removed from l2 bd or xconnect */ + set_int_l2_mode (gtm->vlib_main, vnm, MODE_L3, t->sw_if_index, 0, 0, 0, + 0); + vec_add1 (gtm->free_gtpu_tunnel_hw_if_indices, t->hw_if_index); + + gtm->tunnel_index_by_sw_if_index[t->sw_if_index] = ~0; + + if (!is_ip6) + hash_unset (gtm->gtpu4_tunnel_by_key, key4.as_u64); + else + hash_unset_key_free (>m->gtpu6_tunnel_by_key, &key6); + + if (!ip46_address_is_multicast (&t->dst)) + { + vtep_addr_unref (&t->src); + fib_entry_child_remove 
(t->fib_entry_index, t->sibling_index); + fib_table_entry_delete_index (t->fib_entry_index, FIB_SOURCE_RR); + } + else if (vtep_addr_unref (&t->dst) == 0) + { + mcast_shared_remove (&t->dst); + } + + fib_node_deinit (&t->node); + vec_free (t->rewrite); + pool_put (gtm->tunnels, t); + } + + if (sw_if_indexp) + *sw_if_indexp = sw_if_index; + + return 0; +} + +static uword +get_decap_next_for_node (u32 node_index, u32 ipv4_set) +{ + gtpu_main_t *gtm = >pu_main; + vlib_main_t *vm = gtm->vlib_main; + uword input_node = (ipv4_set) ? gtpu4_input_node.index : + gtpu6_input_node.index; + + return vlib_node_add_next (vm, input_node, node_index); +} + +static uword +unformat_decap_next (unformat_input_t * input, va_list * args) +{ + u32 *result = va_arg (*args, u32 *); + u32 ipv4_set = va_arg (*args, int); + gtpu_main_t *gtm = >pu_main; + vlib_main_t *vm = gtm->vlib_main; + u32 node_index; + u32 tmp; + + if (unformat (input, "l2")) + *result = GTPU_INPUT_NEXT_L2_INPUT; + else if (unformat (input, "ip4")) + *result = GTPU_INPUT_NEXT_IP4_INPUT; + else if (unformat (input, "ip6")) + *result = GTPU_INPUT_NEXT_IP6_INPUT; + else if (unformat (input, "node %U", unformat_vlib_node, vm, &node_index)) + *result = get_decap_next_for_node (node_index, ipv4_set); + else if (unformat (input, "%d", &tmp)) + *result = tmp; + else + return 0; + + return 1; +} + +static clib_error_t * +gtpu_add_del_tunnel_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + ip46_address_t src, dst; + u8 is_add = 1; + u8 src_set = 0; + u8 dst_set = 0; + u8 grp_set = 0; + u8 ipv4_set = 0; + u8 ipv6_set = 0; + u32 encap_fib_index = 0; + u32 mcast_sw_if_index = ~0; + u32 decap_next_index = GTPU_INPUT_NEXT_L2_INPUT; + u32 teid = 0; + u32 tmp; + int rv; + vnet_gtpu_add_del_tunnel_args_t _a, *a = &_a; + u32 tunnel_sw_if_index; + clib_error_t *error = NULL; + + /* Cant "universally zero init" (={0}) due to GCC bug 53119 */ + memset (&src, 0, sizeof src); + memset (&dst, 0, sizeof dst); + + /* Get a line of input. 
*/ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "del")) + { + is_add = 0; + } + else if (unformat (line_input, "src %U", + unformat_ip4_address, &src.ip4)) + { + src_set = 1; + ipv4_set = 1; + } + else if (unformat (line_input, "dst %U", + unformat_ip4_address, &dst.ip4)) + { + dst_set = 1; + ipv4_set = 1; + } + else if (unformat (line_input, "src %U", + unformat_ip6_address, &src.ip6)) + { + src_set = 1; + ipv6_set = 1; + } + else if (unformat (line_input, "dst %U", + unformat_ip6_address, &dst.ip6)) + { + dst_set = 1; + ipv6_set = 1; + } + else if (unformat (line_input, "group %U %U", + unformat_ip4_address, &dst.ip4, + unformat_vnet_sw_interface, + vnet_get_main (), &mcast_sw_if_index)) + { + grp_set = dst_set = 1; + ipv4_set = 1; + } + else if (unformat (line_input, "group %U %U", + unformat_ip6_address, &dst.ip6, + unformat_vnet_sw_interface, + vnet_get_main (), &mcast_sw_if_index)) + { + grp_set = dst_set = 1; + ipv6_set = 1; + } + else if (unformat (line_input, "encap-vrf-id %d", &tmp)) + { + encap_fib_index = fib_table_find (fib_ip_proto (ipv6_set), tmp); + if (encap_fib_index == ~0) + { + error = + clib_error_return (0, "nonexistent encap-vrf-id %d", tmp); + goto done; + } + } + else if (unformat (line_input, "decap-next %U", unformat_decap_next, + &decap_next_index, ipv4_set)) + ; + else if (unformat (line_input, "teid %d", &teid)) + ; + else + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (src_set == 0) + { + error = clib_error_return (0, "tunnel src address not specified"); + goto done; + } + + if (dst_set == 0) + { + error = clib_error_return (0, "tunnel dst address not specified"); + goto done; + } + + if (grp_set && !ip46_address_is_multicast (&dst)) + { + error = clib_error_return (0, "tunnel group address not multicast"); + goto done; + } + + if (grp_set == 0 && ip46_address_is_multicast (&dst)) + { + error = clib_error_return (0, "dst address must be unicast"); + goto done; + } + + if (grp_set && mcast_sw_if_index == ~0) + { + error = clib_error_return (0, "tunnel nonexistent multicast device"); + goto done; + } + + if (ipv4_set && ipv6_set) + { + error = clib_error_return (0, "both IPv4 and IPv6 addresses specified"); + goto done; + } + + if (ip46_address_cmp (&src, &dst) == 0) + { + error = clib_error_return (0, "src and dst addresses are identical"); + goto done; + } + + if (decap_next_index == ~0) + { + error = clib_error_return (0, "next node not found"); + goto done; + } + + memset (a, 0, sizeof (*a)); + + a->is_add = is_add; + a->is_ip6 = ipv6_set; + +#define _(x) a->x = x; + foreach_copy_field; +#undef _ + + rv = vnet_gtpu_add_del_tunnel (a, &tunnel_sw_if_index); + + switch (rv) + { + case 0: + if (is_add) + vlib_cli_output (vm, "%U\n", format_vnet_sw_if_index_name, + vnet_get_main (), tunnel_sw_if_index); + break; + + case VNET_API_ERROR_TUNNEL_EXIST: + error = clib_error_return (0, "tunnel already exists..."); + goto done; + + case VNET_API_ERROR_NO_SUCH_ENTRY: + error = clib_error_return (0, "tunnel does not exist..."); + goto done; + + default: + error = clib_error_return + (0, "vnet_gtpu_add_del_tunnel returned %d", rv); + goto done; + } + +done: + unformat_free (line_input); + + return error; +} + +/*? + * Add or delete a GTPU Tunnel. + * + * GTPU provides the features needed to allow L2 bridge domains (BDs) + * to span multiple servers. 
This is done by building an L2 overlay on + * top of an L3 network underlay using GTPU tunnels. + * + * This makes it possible for servers to be co-located in the same data + * center or be separated geographically, as long as they are reachable + * through the underlay L3 network. + * + * You can refer to this kind of L2 overlay bridge domain as a GTPU + * segment. + * + * @cliexpar + * Example of how to create a GTPU Tunnel: + * @cliexcmd{create gtpu tunnel src 10.0.3.1 dst 10.0.3.3 teid 13 encap-vrf-id 7} + * Example of how to delete a GTPU Tunnel: + * @cliexcmd{create gtpu tunnel src 10.0.3.1 dst 10.0.3.3 teid 13 del} + ?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (create_gtpu_tunnel_command, static) = { + .path = "create gtpu tunnel", + .short_help = + "create gtpu tunnel src <local-vtep-addr>" + " {dst <remote-vtep-addr>|group <mcast-vtep-addr> <intf-name>} teid <nn>" + " [encap-vrf-id <nn>] [decap-next [l2|ip4|ip6|node <name>]] [del]", + .function = gtpu_add_del_tunnel_command_fn, +}; +/* *INDENT-ON* */ + +static clib_error_t * +show_gtpu_tunnel_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + gtpu_main_t *gtm = &gtpu_main; + gtpu_tunnel_t *t; + + if (pool_elts (gtm->tunnels) == 0) + vlib_cli_output (vm, "No gtpu tunnels configured..."); + + pool_foreach (t, gtm->tunnels, ( + { + vlib_cli_output (vm, "%U", + format_gtpu_tunnel, t); + } + )); + + return 0; +} + +/*? + * Display all the GTPU Tunnel entries. + * + * @cliexpar + * Example of how to display the GTPU Tunnel entries: + * @cliexstart{show gtpu tunnel} + * [0] src 10.0.3.1 dst 10.0.3.3 teid 13 encap_fib_index 0 sw_if_index 5 decap_next l2 + * @cliexend + ?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_gtpu_tunnel_command, static) = { + .path = "show gtpu tunnel", + .short_help = "show gtpu tunnel", + .function = show_gtpu_tunnel_command_fn, +}; +/* *INDENT-ON* */ + +void +vnet_int_gtpu_bypass_mode (u32 sw_if_index, u8 is_ip6, u8 is_enable) +{ + if (is_ip6) + vnet_feature_enable_disable ("ip6-unicast", "ip6-gtpu-bypass", + sw_if_index, is_enable, 0, 0); + else + vnet_feature_enable_disable ("ip4-unicast", "ip4-gtpu-bypass", + sw_if_index, is_enable, 0, 0); +} + +static clib_error_t * +set_ip_gtpu_bypass (u32 is_ip6, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + vnet_main_t *vnm = vnet_get_main (); + clib_error_t *error = 0; + u32 sw_if_index, is_enable; + + sw_if_index = ~0; + is_enable = 1; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat_user + (line_input, unformat_vnet_sw_interface, vnm, &sw_if_index)) + ; + else if (unformat (line_input, "del")) + is_enable = 0; + else + { + error = unformat_parse_error (line_input); + goto done; + } + } + + if (~0 == sw_if_index) + { + error = clib_error_return (0, "unknown interface `%U'", + format_unformat_error, line_input); + goto done; + } + + vnet_int_gtpu_bypass_mode (sw_if_index, is_ip6, is_enable); + +done: + unformat_free (line_input); + + return error; +} + +static clib_error_t * +set_ip4_gtpu_bypass (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + return set_ip_gtpu_bypass (0, input, cmd); +} + +/*? + * This command adds the 'ip4-gtpu-bypass' graph node for a given interface.
+ * By adding the IPv4 gtpu-bypass graph node to an interface, the node checks + * for and validates incoming gtpu packets and bypasses the ip4-lookup, + * ip4-local and ip4-udp-lookup nodes, which speeds up gtpu packet forwarding. + * The extra overhead this node adds for non-gtpu packets is kept to a minimum. + * + * @cliexpar + * @parblock + * Example of graph node before ip4-gtpu-bypass is enabled: + * @cliexstart{show vlib graph ip4-gtpu-bypass} + * Name Next Previous + * ip4-gtpu-bypass error-drop [0] + * gtpu4-input [1] + * ip4-lookup [2] + * @cliexend + * + * Example of how to enable ip4-gtpu-bypass on an interface: + * @cliexcmd{set interface ip gtpu-bypass GigabitEthernet2/0/0} + * + * Example of graph node after ip4-gtpu-bypass is enabled: + * @cliexstart{show vlib graph ip4-gtpu-bypass} + * Name Next Previous + * ip4-gtpu-bypass error-drop [0] ip4-input + * gtpu4-input [1] ip4-input-no-checksum + * ip4-lookup [2] + * @cliexend + * + * Example of how to display the feature enabled on an interface: + * @cliexstart{show ip interface features GigabitEthernet2/0/0} + * IP feature paths configured on GigabitEthernet2/0/0... + * ... + * ipv4 unicast: + * ip4-gtpu-bypass + * ip4-lookup + * ... + * @cliexend + * + * Example of how to disable ip4-gtpu-bypass on an interface: + * @cliexcmd{set interface ip gtpu-bypass GigabitEthernet2/0/0 del} + * @endparblock +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (set_interface_ip_gtpu_bypass_command, static) = { + .path = "set interface ip gtpu-bypass", + .function = set_ip4_gtpu_bypass, + .short_help = "set interface ip gtpu-bypass <interface> [del]", +}; +/* *INDENT-ON* */ + +static clib_error_t * +set_ip6_gtpu_bypass (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + return set_ip_gtpu_bypass (1, input, cmd); +} + +/*? + * This command adds the 'ip6-gtpu-bypass' graph node for a given interface. + * By adding the IPv6 gtpu-bypass graph node to an interface, the node checks + * for and validates incoming gtpu packets and bypasses the ip6-lookup, + * ip6-local and ip6-udp-lookup nodes, which speeds up gtpu packet forwarding. + * The extra overhead this node adds for non-gtpu packets is kept to a minimum. + * + * @cliexpar + * @parblock + * Example of graph node before ip6-gtpu-bypass is enabled: + * @cliexstart{show vlib graph ip6-gtpu-bypass} + * Name Next Previous + * ip6-gtpu-bypass error-drop [0] + * gtpu6-input [1] + * ip6-lookup [2] + * @cliexend + * + * Example of how to enable ip6-gtpu-bypass on an interface: + * @cliexcmd{set interface ip6 gtpu-bypass GigabitEthernet2/0/0} + * + * Example of graph node after ip6-gtpu-bypass is enabled: + * @cliexstart{show vlib graph ip6-gtpu-bypass} + * Name Next Previous + * ip6-gtpu-bypass error-drop [0] ip6-input + * gtpu6-input [1] + * ip6-lookup [2] + * @cliexend + * + * Example of how to display the feature enabled on an interface: + * @cliexstart{show ip interface features GigabitEthernet2/0/0} + * IP feature paths configured on GigabitEthernet2/0/0... + * ... + * ipv6 unicast: + * ip6-gtpu-bypass + * ip6-lookup + * ...
+ * @cliexend + * + * Example of how to disable ip6-gtpu-bypass on an interface: + * @cliexcmd{set interface ip6 gtpu-bypass GigabitEthernet2/0/0 del} + * @endparblock +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (set_interface_ip6_gtpu_bypass_command, static) = { + .path = "set interface ip6 gtpu-bypass", + .function = set_ip6_gtpu_bypass, + .short_help = "set interface ip6 gtpu-bypass <interface> [del]", +}; +/* *INDENT-ON* */ + +clib_error_t * +gtpu_init (vlib_main_t * vm) +{ + gtpu_main_t *gtm = &gtpu_main; + + gtm->vnet_main = vnet_get_main (); + gtm->vlib_main = vm; + + /* initialize the ip6 hash */ + gtm->gtpu6_tunnel_by_key = hash_create_mem (0, + sizeof (gtpu6_tunnel_key_t), + sizeof (uword)); + gtm->vtep6 = hash_create_mem (0, sizeof (ip6_address_t), sizeof (uword)); + gtm->mcast_shared = hash_create_mem (0, + sizeof (ip46_address_t), + sizeof (mcast_shared_t)); + + udp_register_dst_port (vm, UDP_DST_PORT_GTPU, + gtpu4_input_node.index, /* is_ip4 */ 1); + udp_register_dst_port (vm, UDP_DST_PORT_GTPU6, + gtpu6_input_node.index, /* is_ip4 */ 0); + + gtm->fib_node_type = fib_node_register_new_type (&gtpu_vft); + + return 0; +} + +VLIB_INIT_FUNCTION (gtpu_init); + +/* *INDENT-OFF* */ +VLIB_PLUGIN_REGISTER () = { + .version = VPP_BUILD_VER, + .description = "GTPv1-U", +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/gtpu/gtpu.h b/src/plugins/gtpu/gtpu.h new file mode 100644 index 00000000..744d21d4 --- /dev/null +++ b/src/plugins/gtpu/gtpu.h @@ -0,0 +1,264 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2017 Intel and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
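gtpu_init above creates the v6 and mcast tables with hash_create_mem, which stores key pointers rather than key bytes; that is why the patch defines hash_set_key_copy / hash_unset_key_free earlier instead of ever inserting a stack address. A standalone model of the ownership rule those helpers enforce (toy types, not the clib hash):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct entry
{
  void *key;			/* the table keeps this pointer */
  uint64_t value;
};

static void
set_key_copy (struct entry *e, const void *key, size_t ksz, uint64_t v)
{
  void *copy = malloc (ksz);	/* hash_set_key_copy uses clib_mem_alloc */
  memcpy (copy, key, ksz);
  e->key = copy;		/* the table owns the copy... */
  e->value = v;
}

static void
unset_key_free (struct entry *e)
{
  free (e->key);		/* ...so removal must free it again */
  e->key = 0;
}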
+ *------------------------------------------------------------------ + */ + +#ifndef included_vnet_gtpu_h +#define included_vnet_gtpu_h + +#include <vppinfra/lock.h> +#include <vppinfra/error.h> +#include <vppinfra/hash.h> +#include <vnet/vnet.h> +#include <vnet/ip/ip.h> +#include <vnet/l2/l2_input.h> +#include <vnet/l2/l2_output.h> +#include <vnet/l2/l2_bd.h> +#include <vnet/ethernet/ethernet.h> +#include <vnet/ip/ip4_packet.h> +#include <vnet/ip/ip6_packet.h> +#include <vnet/udp/udp.h> +#include <vnet/dpo/dpo.h> +#include <vnet/adj/adj_types.h> +#include <vnet/fib/fib_table.h> + +/** + * Bits + * Octets 8 7 6 5 4 3 2 1 + * 1 Version PT (*) E S PN + * 2 Message Type + * 3 Length (1st Octet) + * 4 Length (2nd Octet) + * 5 Tunnel Endpoint Identifier (1st Octet) + * 6 Tunnel Endpoint Identifier (2nd Octet) + * 7 Tunnel Endpoint Identifier (3rd Octet) + * 8 Tunnel Endpoint Identifier (4th Octet) + * 9 Sequence Number (1st Octet)1) 4) + * 10 Sequence Number (2nd Octet)1) 4) + * 11 N-PDU Number2) 4) + * 12 Next Extension Header Type3) 4) +**/ + +typedef struct +{ + u8 ver_flags; + u8 type; + u16 length; /* length in octets of the payload */ + u32 teid; + u16 sequence; + u8 pdu_number; + u8 next_ext_type; +} gtpu_header_t; + +#define GTPU_VER_MASK (7<<5) +#define GTPU_PT_BIT (1<<4) +#define GTPU_E_BIT (1<<2) +#define GTPU_S_BIT (1<<1) +#define GTPU_PN_BIT (1<<0) +#define GTPU_E_S_PN_BIT (7<<0) + +#define GTPU_V1_VER (1<<5) + +#define GTPU_PT_GTP (1<<4) +#define GTPU_TYPE_GTPU 255 + +/* *INDENT-OFF* */ +typedef CLIB_PACKED(struct +{ + ip4_header_t ip4; /* 20 bytes */ + udp_header_t udp; /* 8 bytes */ + gtpu_header_t gtpu; /* 8 bytes */ +}) ip4_gtpu_header_t; +/* *INDENT-ON* */ + +/* *INDENT-OFF* */ +typedef CLIB_PACKED(struct +{ + ip6_header_t ip6; /* 40 bytes */ + udp_header_t udp; /* 8 bytes */ + gtpu_header_t gtpu; /* 8 bytes */ +}) ip6_gtpu_header_t; +/* *INDENT-ON* */ + +/* *INDENT-OFF* */ +typedef CLIB_PACKED +(struct { + /* + * Key fields: ip src and gtpu teid on incoming gtpu packet + * all fields in NET byte order + */ + union { + struct { + u32 src; + u32 teid; + }; + u64 as_u64; + }; +}) gtpu4_tunnel_key_t; +/* *INDENT-ON* */ + +/* *INDENT-OFF* */ +typedef CLIB_PACKED +(struct { + /* + * Key fields: ip src and gtpu teid on incoming gtpu packet + * all fields in NET byte order + */ + ip6_address_t src; + u32 teid; +}) gtpu6_tunnel_key_t; +/* *INDENT-ON* */ + +typedef struct +{ + /* Rewrite string */ + u8 *rewrite; + + /* FIB DPO for IP forwarding of gtpu encap packet */ + dpo_id_t next_dpo; + + /* gtpu teid in HOST byte order */ + u32 teid; + + /* tunnel src and dst addresses */ + ip46_address_t src; + ip46_address_t dst; + + /* mcast packet output intf index (used only if dst is mcast) */ + u32 mcast_sw_if_index; + + /* decap next index */ + u32 decap_next_index; + + /* The FIB index for src/dst addresses */ + u32 encap_fib_index; + + /* vnet intfc index */ + u32 sw_if_index; + u32 hw_if_index; + + /** + * Linkage into the FIB object graph + */ + fib_node_t node; + + /* + * The FIB entry for (depending on gtpu tunnel is unicast or mcast) + * sending unicast gtpu encap packets or receiving mcast gtpu packets + */ + fib_node_index_t fib_entry_index; + adj_index_t mcast_adj_index; + + /** + * The tunnel is a child of the FIB entry for its destination. This is + * so it receives updates when the forwarding information for that entry + * changes. + * The tunnels sibling index on the FIB entry's dependency list. 
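On the constants just defined: the first GTPU octet packs the version, the protocol type and the E/S/PN option flags. A self-contained check of the value the encap path writes (ver_flags = GTPU_V1_VER | GTPU_PT_GTP, as in ip_udp_gtpu_rewrite), with the masks copied from the header above:

#include <assert.h>
#include <stdint.h>

#define GTPU_VER_MASK (7<<5)
#define GTPU_PT_BIT (1<<4)
#define GTPU_E_S_PN_BIT (7<<0)
#define GTPU_V1_VER (1<<5)
#define GTPU_PT_GTP (1<<4)

int
main (void)
{
  uint8_t ver_flags = GTPU_V1_VER | GTPU_PT_GTP;	/* 0x30 on the wire */
  assert ((ver_flags & GTPU_VER_MASK) >> 5 == 1);	/* GTP version 1 */
  assert (ver_flags & GTPU_PT_BIT);			/* protocol type: GTP */
  assert ((ver_flags & GTPU_E_S_PN_BIT) == 0);		/* no optional fields */
  return 0;
}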
+ */ + u32 sibling_index; +} gtpu_tunnel_t; + +#define foreach_gtpu_input_next \ +_(DROP, "error-drop") \ +_(L2_INPUT, "l2-input") \ +_(IP4_INPUT, "ip4-input") \ +_(IP6_INPUT, "ip6-input" ) + +typedef enum +{ +#define _(s,n) GTPU_INPUT_NEXT_##s, + foreach_gtpu_input_next +#undef _ + GTPU_INPUT_N_NEXT, +} gtpu_input_next_t; + +typedef enum +{ +#define gtpu_error(n,s) GTPU_ERROR_##n, +#include <gtpu/gtpu_error.def> +#undef gtpu_error + GTPU_N_ERROR, +} gtpu_input_error_t; + +typedef struct +{ + /* vector of encap tunnel instances */ + gtpu_tunnel_t *tunnels; + + /* lookup tunnel by key */ + uword *gtpu4_tunnel_by_key; /* keyed on ipv4.dst + teid */ + uword *gtpu6_tunnel_by_key; /* keyed on ipv6.dst + teid */ + + /* local VTEP IPs ref count used by gtpu-bypass node to check if + received gtpu packet DIP matches any local VTEP address */ + uword *vtep4; /* local ip4 VTEPs keyed on their ip4 addr */ + uword *vtep6; /* local ip6 VTEPs keyed on their ip6 addr */ + + /* mcast shared info */ + uword *mcast_shared; /* keyed on mcast ip46 addr */ + + /* Free vlib hw_if_indices */ + u32 *free_gtpu_tunnel_hw_if_indices; + + /* Mapping from sw_if_index to tunnel index */ + u32 *tunnel_index_by_sw_if_index; + + /** + * Node type for registering to fib changes. + */ + fib_node_type_t fib_node_type; + + /* API message ID base */ + u16 msg_id_base; + + /* convenience */ + vlib_main_t *vlib_main; + vnet_main_t *vnet_main; +} gtpu_main_t; + +gtpu_main_t gtpu_main; + +extern vlib_node_registration_t gtpu4_input_node; +extern vlib_node_registration_t gtpu6_input_node; +extern vlib_node_registration_t gtpu4_encap_node; +extern vlib_node_registration_t gtpu6_encap_node; + +u8 *format_gtpu_encap_trace (u8 * s, va_list * args); + +typedef struct +{ + u8 is_add; + u8 is_ip6; + ip46_address_t src, dst; + u32 mcast_sw_if_index; + u32 encap_fib_index; + u32 decap_next_index; + u32 teid; +} vnet_gtpu_add_del_tunnel_args_t; + +int vnet_gtpu_add_del_tunnel + (vnet_gtpu_add_del_tunnel_args_t * a, u32 * sw_if_indexp); + +void vnet_int_gtpu_bypass_mode (u32 sw_if_index, u8 is_ip6, u8 is_enable); +#endif /* included_vnet_gtpu_h */ + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/gtpu/gtpu_all_api_h.h b/src/plugins/gtpu/gtpu_all_api_h.h new file mode 100644 index 00000000..dbfe0397 --- /dev/null +++ b/src/plugins/gtpu/gtpu_all_api_h.h @@ -0,0 +1,18 @@ +/* + * gtpu_all_api_h.h - plug-in api #include file + * + * Copyright (c) 2017 Intel and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
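A last note on the lookup keys declared above: gtpu4_tunnel_key_t overlays {src, teid} on a single u64 so gtpu4_tunnel_by_key can be a scalar-keyed hash, while the v6 key (ip6 src plus teid) needs the mem-keyed variant and the key-copy helpers. A standalone sketch of the same packing; both fields stay in network byte order as the struct comments require, and the values below are purely illustrative:

#include <stdint.h>
#include <stdio.h>

typedef union			/* mirrors gtpu4_tunnel_key_t (C11 anonymous struct) */
{
  struct
  {
    uint32_t src;		/* outer ip4 src, net order */
    uint32_t teid;		/* tunnel endpoint id, net order */
  };
  uint64_t as_u64;		/* what hash_get () is keyed on */
} gtpu4_key;

int
main (void)
{
  gtpu4_key k = {.src = 0x0a000001,.teid = 13 };
  printf ("scalar hash key: 0x%016llx\n", (unsigned long long) k.as_u64);
  return 0;
}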
+ */
+/* Include the generated file, see BUILT_SOURCES in Makefile.am */
+#include <gtpu/gtpu.api.h>
diff --git a/src/plugins/gtpu/gtpu_api.c b/src/plugins/gtpu/gtpu_api.c
new file mode 100644
index 00000000..49a5053d
--- /dev/null
+++ b/src/plugins/gtpu/gtpu_api.c
@@ -0,0 +1,256 @@
+/*
+ *------------------------------------------------------------------
+ * gtpu_api.c - gtpu api
+ *
+ * Copyright (c) 2017 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vnet/interface.h>
+#include <vnet/api_errno.h>
+#include <vnet/feature/feature.h>
+#include <vnet/fib/fib_table.h>
+
+#include <vppinfra/byte_order.h>
+#include <vlibmemory/api.h>
+#include <vlibsocket/api.h>
+
+#include <gtpu/gtpu.h>
+
+
+#define vl_msg_id(n,h) n,
+typedef enum
+{
+#include <gtpu/gtpu.api.h>
+  /* We'll want to know how many message IDs we need... */
+  VL_MSG_FIRST_AVAILABLE,
+} vl_msg_id_t;
+#undef vl_msg_id
+
+/* define message structures */
+#define vl_typedefs
+#include <gtpu/gtpu.api.h>
+#undef vl_typedefs
+
+/* define generated endian-swappers */
+#define vl_endianfun
+#include <gtpu/gtpu.api.h>
+#undef vl_endianfun
+
+/* instantiate all the print functions we know about */
+#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__)
+#define vl_printfun
+#include <gtpu/gtpu.api.h>
+#undef vl_printfun
+
+/* Get the API version number */
+#define vl_api_version(n,v) static u32 api_version=(v);
+#include <gtpu/gtpu.api.h>
+#undef vl_api_version
+
+#define vl_msg_name_crc_list
+#include <gtpu/gtpu.api.h>
+#undef vl_msg_name_crc_list
+
+#define REPLY_MSG_ID_BASE gtm->msg_id_base
+#include <vlibapi/api_helper_macros.h>
+
+static void
+setup_message_id_table (gtpu_main_t * gtm, api_main_t * am)
+{
+#define _(id,n,crc) \
+  vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id + gtm->msg_id_base);
+  foreach_vl_msg_name_crc_gtpu;
+#undef _
+}
+
+#define foreach_gtpu_plugin_api_msg                                   \
+_(SW_INTERFACE_SET_GTPU_BYPASS, sw_interface_set_gtpu_bypass)         \
+_(GTPU_ADD_DEL_TUNNEL, gtpu_add_del_tunnel)                           \
+_(GTPU_TUNNEL_DUMP, gtpu_tunnel_dump)
+
+static void
+  vl_api_sw_interface_set_gtpu_bypass_t_handler
+  (vl_api_sw_interface_set_gtpu_bypass_t * mp)
+{
+  vl_api_sw_interface_set_gtpu_bypass_reply_t *rmp;
+  int rv = 0;
+  u32 sw_if_index = ntohl (mp->sw_if_index);
+  gtpu_main_t *gtm = &gtpu_main;
+
+  VALIDATE_SW_IF_INDEX (mp);
+
+  vnet_int_gtpu_bypass_mode (sw_if_index, mp->is_ipv6, mp->enable);
+  BAD_SW_IF_INDEX_LABEL;
+
+  REPLY_MACRO (VL_API_SW_INTERFACE_SET_GTPU_BYPASS_REPLY);
+}
+
+static void vl_api_gtpu_add_del_tunnel_t_handler
+  (vl_api_gtpu_add_del_tunnel_t * mp)
+{
+  vl_api_gtpu_add_del_tunnel_reply_t *rmp;
+  int rv = 0;
+  ip4_main_t *im = &ip4_main;
+  gtpu_main_t *gtm = &gtpu_main;
+
+  uword *p = hash_get (im->fib_index_by_table_id, ntohl (mp->encap_vrf_id));
+  if (!p)
+    {
+      rv = VNET_API_ERROR_NO_SUCH_FIB;
+      goto out;
+    }
+
+  vnet_gtpu_add_del_tunnel_args_t a = {
+    .is_add = mp->is_add,
+    .is_ip6 = mp->is_ipv6,
+    .mcast_sw_if_index = ntohl (mp->mcast_sw_if_index),
+    .encap_fib_index = p[0],
+    .decap_next_index = ntohl (mp->decap_next_index),
+    .teid = ntohl (mp->teid),
+    .dst = to_ip46 (mp->is_ipv6, mp->dst_address),
+    .src = to_ip46 (mp->is_ipv6, mp->src_address),
+  };
+
+  /* Check src & dst are different */
+  if (ip46_address_cmp (&a.dst, &a.src) == 0)
+    {
+      rv = VNET_API_ERROR_SAME_SRC_DST;
+      goto out;
+    }
+  if (ip46_address_is_multicast (&a.dst) &&
+      !vnet_sw_if_index_is_api_valid (a.mcast_sw_if_index))
+    {
+      rv = VNET_API_ERROR_INVALID_SW_IF_INDEX;
+      goto out;
+    }
+
+  u32 sw_if_index = ~0;
+  rv = vnet_gtpu_add_del_tunnel (&a, &sw_if_index);
+
+out:
+  /* *INDENT-OFF* */
+  REPLY_MACRO2(VL_API_GTPU_ADD_DEL_TUNNEL_REPLY,
+  ({
+    rmp->sw_if_index = ntohl (sw_if_index);
+  }));
+  /* *INDENT-ON* */
+}
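A hedged sketch of the client side of this message (illustrative only: the
helper name is invented, the field names mirror the mp-> accesses in the
handler above, the generated gtpu.api.h layout is authoritative, and
msg_id_base is the runtime base allocated in gtpu_api_hookup() further down):

    /* Illustrative only -- not part of the patch. All scalars travel in
     * network byte order; the handler ntohl()s them back. */
    static void
    example_fill_gtpu_add_request (vl_api_gtpu_add_del_tunnel_t * mp,
                                   u16 msg_id_base,
                                   ip4_address_t * src, ip4_address_t * dst,
                                   u32 teid_host_order)
    {
      memset (mp, 0, sizeof (*mp));
      /* plugin message IDs are offset by the base assigned at load time */
      mp->_vl_msg_id = htons (VL_API_GTPU_ADD_DEL_TUNNEL + msg_id_base);
      mp->is_add = 1;
      mp->is_ipv6 = 0;
      clib_memcpy (mp->src_address, src, 4);
      clib_memcpy (mp->dst_address, dst, 4);
      mp->encap_vrf_id = htonl (0);       /* must name an existing ip4 FIB */
      mp->decap_next_index = htonl (~0);  /* let the plugin pick the default */
      mp->mcast_sw_if_index = htonl (~0); /* unicast tunnel */
      mp->teid = htonl (teid_host_order);
    }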
+static void send_gtpu_tunnel_details
+  (gtpu_tunnel_t * t, unix_shared_memory_queue_t * q, u32 context)
+{
+  vl_api_gtpu_tunnel_details_t *rmp;
+  ip4_main_t *im4 = &ip4_main;
+  ip6_main_t *im6 = &ip6_main;
+  u8 is_ipv6 = !ip46_address_is_ip4 (&t->dst);
+
+  rmp = vl_msg_api_alloc (sizeof (*rmp));
+  memset (rmp, 0, sizeof (*rmp));
+  rmp->_vl_msg_id = ntohs (VL_API_GTPU_TUNNEL_DETAILS);
+  if (is_ipv6)
+    {
+      memcpy (rmp->src_address, t->src.ip6.as_u8, 16);
+      memcpy (rmp->dst_address, t->dst.ip6.as_u8, 16);
+      rmp->encap_vrf_id = htonl (im6->fibs[t->encap_fib_index].ft_table_id);
+    }
+  else
+    {
+      memcpy (rmp->src_address, t->src.ip4.as_u8, 4);
+      memcpy (rmp->dst_address, t->dst.ip4.as_u8, 4);
+      rmp->encap_vrf_id = htonl (im4->fibs[t->encap_fib_index].ft_table_id);
+    }
+  rmp->mcast_sw_if_index = htonl (t->mcast_sw_if_index);
+  rmp->teid = htonl (t->teid);
+  rmp->decap_next_index = htonl (t->decap_next_index);
+  rmp->sw_if_index = htonl (t->sw_if_index);
+  rmp->is_ipv6 = is_ipv6;
+  rmp->context = context;
+
+  vl_msg_api_send_shmem (q, (u8 *) & rmp);
+}
+
+static void
+vl_api_gtpu_tunnel_dump_t_handler (vl_api_gtpu_tunnel_dump_t * mp)
+{
+  unix_shared_memory_queue_t *q;
+  gtpu_main_t *gtm = &gtpu_main;
+  gtpu_tunnel_t *t;
+  u32 sw_if_index;
+
+  q = vl_api_client_index_to_input_queue (mp->client_index);
+  if (q == 0)
+    {
+      return;
+    }
+
+  sw_if_index = ntohl (mp->sw_if_index);
+
+  if (~0 == sw_if_index)
+    {
+      /* *INDENT-OFF* */
+      pool_foreach (t, gtm->tunnels,
+      ({
+        send_gtpu_tunnel_details(t, q, mp->context);
+      }));
+      /* *INDENT-ON* */
+    }
+  else
+    {
+      if ((sw_if_index >= vec_len (gtm->tunnel_index_by_sw_if_index)) ||
+          (~0 == gtm->tunnel_index_by_sw_if_index[sw_if_index]))
+        {
+          return;
+        }
+      t = &gtm->tunnels[gtm->tunnel_index_by_sw_if_index[sw_if_index]];
+      send_gtpu_tunnel_details (t, q, mp->context);
+    }
+}
+
+
+static clib_error_t *
+gtpu_api_hookup (vlib_main_t * vm)
+{
+  gtpu_main_t *gtm = &gtpu_main;
+
+  u8 *name = format (0, "gtpu_%08x%c", api_version, 0);
+  gtm->msg_id_base = vl_msg_api_get_msg_ids
+    ((char *) name, VL_MSG_FIRST_AVAILABLE);
+
+#define _(N,n)                                                  \
+    vl_msg_api_set_handlers((VL_API_##N + gtm->msg_id_base),    \
+                           #n,                                  \
+                           vl_api_##n##_t_handler,              \
+                           vl_noop_handler,                     \
+                           vl_api_##n##_t_endian,               \
+                           vl_api_##n##_t_print,                \
+                           sizeof(vl_api_##n##_t), 1);
+  foreach_gtpu_plugin_api_msg;
+#undef _
+
+  /* Add our API messages to the global name_crc hash table */
+  setup_message_id_table (gtm, &api_main);
+
+  return 0;
+}
+
+VLIB_API_INIT_FUNCTION (gtpu_api_hookup);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
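The decap node that follows keys its tunnel lookup on the packed
gtpu4_tunnel_key_t from gtpu.h: the outer source address and the TEID, both
left in network byte order, overlay a single u64, so the fast path costs one
hash_get() per packet. A minimal sketch of that lookup (illustrative only;
the helper name is invented, the types and hash table come from this patch):

    /* Illustrative only -- not part of the patch. */
    static inline gtpu_tunnel_t *
    example_lookup_tunnel_ip4 (gtpu_main_t * gtm, u32 src_net, u32 teid_net)
    {
      gtpu4_tunnel_key_t key;
      uword *p;

      key.src = src_net;    /* outer IPv4 source, as received */
      key.teid = teid_net;  /* TEID, still big-endian from the wire */
      p = hash_get (gtm->gtpu4_tunnel_by_key, key.as_u64);
      return p ? pool_elt_at_index (gtm->tunnels, p[0]) : NULL;
    }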
diff --git a/src/plugins/gtpu/gtpu_decap.c b/src/plugins/gtpu/gtpu_decap.c
new file mode 100644
index 00000000..de235889
--- /dev/null
+++ b/src/plugins/gtpu/gtpu_decap.c
@@ -0,0 +1,1305 @@
+/*
+ * decap.c: gtpu tunnel decap packet processing
+ *
+ * Copyright (c) 2017 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <gtpu/gtpu.h>
+
+vlib_node_registration_t gtpu4_input_node;
+vlib_node_registration_t gtpu6_input_node;
+
+typedef struct {
+  u32 next_index;
+  u32 tunnel_index;
+  u32 error;
+  u32 teid;
+} gtpu_rx_trace_t;
+
+static u8 * format_gtpu_rx_trace (u8 * s, va_list * args)
+{
+  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+  gtpu_rx_trace_t * t = va_arg (*args, gtpu_rx_trace_t *);
+
+  if (t->tunnel_index != ~0)
+    {
+      s = format (s, "GTPU decap from gtpu_tunnel%d teid %d next %d error %d",
+                  t->tunnel_index, t->teid, t->next_index, t->error);
+    }
+  else
+    {
+      s = format (s, "GTPU decap error - tunnel for teid %d does not exist",
+                  t->teid);
+    }
+  return s;
+}
+
+always_inline u32
+validate_gtpu_fib (vlib_buffer_t *b, gtpu_tunnel_t *t, u32 is_ip4)
+{
+  u32 fib_index, sw_if_index;
+
+  sw_if_index = vnet_buffer (b)->sw_if_index[VLIB_RX];
+
+  if (is_ip4)
+    fib_index = (vnet_buffer (b)->sw_if_index[VLIB_TX] == (u32) ~ 0) ?
+      vec_elt (ip4_main.fib_index_by_sw_if_index, sw_if_index) :
+      vnet_buffer (b)->sw_if_index[VLIB_TX];
+  else
+    fib_index = (vnet_buffer (b)->sw_if_index[VLIB_TX] == (u32) ~ 0) ?
+      vec_elt (ip6_main.fib_index_by_sw_if_index, sw_if_index) :
+      vnet_buffer (b)->sw_if_index[VLIB_TX];
+
+  return (fib_index == t->encap_fib_index);
+}
+
+always_inline uword
+gtpu_input (vlib_main_t * vm,
+            vlib_node_runtime_t * node,
+            vlib_frame_t * from_frame,
+            u32 is_ip4)
+{
+  u32 n_left_from, next_index, * from, * to_next;
+  gtpu_main_t * gtm = &gtpu_main;
+  vnet_main_t * vnm = gtm->vnet_main;
+  vnet_interface_main_t * im = &vnm->interface_main;
+  u32 last_tunnel_index = ~0;
+  gtpu4_tunnel_key_t last_key4;
+  gtpu6_tunnel_key_t last_key6;
+  u32 pkts_decapsulated = 0;
+  u32 thread_index = vlib_get_thread_index();
+  u32 stats_sw_if_index, stats_n_packets, stats_n_bytes;
+
+  if (is_ip4)
+    last_key4.as_u64 = ~0;
+  else
+    memset (&last_key6, 0xff, sizeof (last_key6));
+
+  from = vlib_frame_vector_args (from_frame);
+  n_left_from = from_frame->n_vectors;
+
+  next_index = node->cached_next_index;
+  stats_sw_if_index = node->runtime_data[0];
+  stats_n_packets = stats_n_bytes = 0;
+
+  while (n_left_from > 0)
+    {
+      u32 n_left_to_next;
+
+      vlib_get_next_frame (vm, node, next_index,
+                           to_next, n_left_to_next);
+      while (n_left_from >= 4 && n_left_to_next >= 2)
+        {
+          u32 bi0, bi1;
+          vlib_buffer_t * b0, * b1;
+          u32 next0, next1;
+          ip4_header_t * ip4_0, * ip4_1;
+          ip6_header_t * ip6_0, * ip6_1;
+          gtpu_header_t * gtpu0, * gtpu1;
+          u32 gtpu_hdr_len0 = 0, gtpu_hdr_len1 = 0;
+          uword * p0, * p1;
+          u32 tunnel_index0, tunnel_index1;
+          gtpu_tunnel_t * t0, * t1, * mt0 = NULL, * mt1 = NULL;
+          gtpu4_tunnel_key_t key4_0, key4_1;
+          gtpu6_tunnel_key_t key6_0, key6_1;
+          u32 error0, error1;
+          u32 sw_if_index0, sw_if_index1, len0, len1;
+
+          /* Prefetch next iteration.
*/ + { + vlib_buffer_t * p2, * p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, 2*CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (p3->data, 2*CLIB_CACHE_LINE_BYTES, LOAD); + } + + bi0 = from[0]; + bi1 = from[1]; + to_next[0] = bi0; + to_next[1] = bi1; + from += 2; + to_next += 2; + n_left_to_next -= 2; + n_left_from -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + /* udp leaves current_data pointing at the gtpu header */ + gtpu0 = vlib_buffer_get_current (b0); + gtpu1 = vlib_buffer_get_current (b1); + if (is_ip4) { + vlib_buffer_advance + (b0, -(word)(sizeof(udp_header_t)+sizeof(ip4_header_t))); + vlib_buffer_advance + (b1, -(word)(sizeof(udp_header_t)+sizeof(ip4_header_t))); + ip4_0 = vlib_buffer_get_current (b0); + ip4_1 = vlib_buffer_get_current (b1); + } else { + vlib_buffer_advance + (b0, -(word)(sizeof(udp_header_t)+sizeof(ip6_header_t))); + vlib_buffer_advance + (b1, -(word)(sizeof(udp_header_t)+sizeof(ip6_header_t))); + ip6_0 = vlib_buffer_get_current (b0); + ip6_1 = vlib_buffer_get_current (b1); + } + + /* pop (ip, udp, gtpu) */ + if (is_ip4) { + vlib_buffer_advance + (b0, sizeof(*ip4_0)+sizeof(udp_header_t)); + vlib_buffer_advance + (b1, sizeof(*ip4_1)+sizeof(udp_header_t)); + } else { + vlib_buffer_advance + (b0, sizeof(*ip6_0)+sizeof(udp_header_t)); + vlib_buffer_advance + (b1, sizeof(*ip6_1)+sizeof(udp_header_t)); + } + + tunnel_index0 = ~0; + error0 = 0; + + tunnel_index1 = ~0; + error1 = 0; + + if (PREDICT_FALSE ((gtpu0->ver_flags & GTPU_VER_MASK) != GTPU_V1_VER)) + { + error0 = GTPU_ERROR_BAD_VER; + next0 = GTPU_INPUT_NEXT_DROP; + goto trace0; + } + + /* Manipulate packet 0 */ + if (is_ip4) { + key4_0.src = ip4_0->src_address.as_u32; + key4_0.teid = gtpu0->teid; + + /* Make sure GTPU tunnel exist according to packet SIP and teid + * SIP identify a GTPU path, and teid identify a tunnel in a given GTPU path */ + if (PREDICT_FALSE (key4_0.as_u64 != last_key4.as_u64)) + { + p0 = hash_get (gtm->gtpu4_tunnel_by_key, key4_0.as_u64); + if (PREDICT_FALSE (p0 == NULL)) + { + error0 = GTPU_ERROR_NO_SUCH_TUNNEL; + next0 = GTPU_INPUT_NEXT_DROP; + goto trace0; + } + last_key4.as_u64 = key4_0.as_u64; + tunnel_index0 = last_tunnel_index = p0[0]; + } + else + tunnel_index0 = last_tunnel_index; + t0 = pool_elt_at_index (gtm->tunnels, tunnel_index0); + + /* Validate GTPU tunnel encap-fib index agaist packet */ + if (PREDICT_FALSE (validate_gtpu_fib (b0, t0, is_ip4) == 0)) + { + error0 = GTPU_ERROR_NO_SUCH_TUNNEL; + next0 = GTPU_INPUT_NEXT_DROP; + goto trace0; + } + + /* Validate GTPU tunnel SIP against packet DIP */ + if (PREDICT_TRUE (ip4_0->dst_address.as_u32 == t0->src.ip4.as_u32)) + goto next0; /* valid packet */ + if (PREDICT_FALSE (ip4_address_is_multicast (&ip4_0->dst_address))) + { + key4_0.src = ip4_0->dst_address.as_u32; + key4_0.teid = gtpu0->teid; + /* Make sure mcast GTPU tunnel exist by packet DIP and teid */ + p0 = hash_get (gtm->gtpu4_tunnel_by_key, key4_0.as_u64); + if (PREDICT_TRUE (p0 != NULL)) + { + mt0 = pool_elt_at_index (gtm->tunnels, p0[0]); + goto next0; /* valid packet */ + } + } + error0 = GTPU_ERROR_NO_SUCH_TUNNEL; + next0 = GTPU_INPUT_NEXT_DROP; + goto trace0; + + } else /* !is_ip4 */ { + key6_0.src.as_u64[0] = ip6_0->src_address.as_u64[0]; + key6_0.src.as_u64[1] = ip6_0->src_address.as_u64[1]; + key6_0.teid = gtpu0->teid; + + /* Make sure GTPU tunnel exist according to packet SIP and teid 
+ * SIP identify a GTPU path, and teid identify a tunnel in a given GTPU path */ + if (PREDICT_FALSE (memcmp(&key6_0, &last_key6, sizeof(last_key6)) != 0)) + { + p0 = hash_get_mem (gtm->gtpu6_tunnel_by_key, &key6_0); + if (PREDICT_FALSE (p0 == NULL)) + { + error0 = GTPU_ERROR_NO_SUCH_TUNNEL; + next0 = GTPU_INPUT_NEXT_DROP; + goto trace0; + } + clib_memcpy (&last_key6, &key6_0, sizeof(key6_0)); + tunnel_index0 = last_tunnel_index = p0[0]; + } + else + tunnel_index0 = last_tunnel_index; + t0 = pool_elt_at_index (gtm->tunnels, tunnel_index0); + + /* Validate GTPU tunnel encap-fib index agaist packet */ + if (PREDICT_FALSE (validate_gtpu_fib (b0, t0, is_ip4) == 0)) + { + error0 = GTPU_ERROR_NO_SUCH_TUNNEL; + next0 = GTPU_INPUT_NEXT_DROP; + goto trace0; + } + + /* Validate GTPU tunnel SIP against packet DIP */ + if (PREDICT_TRUE (ip6_address_is_equal (&ip6_0->dst_address, + &t0->src.ip6))) + goto next0; /* valid packet */ + if (PREDICT_FALSE (ip6_address_is_multicast (&ip6_0->dst_address))) + { + key6_0.src.as_u64[0] = ip6_0->dst_address.as_u64[0]; + key6_0.src.as_u64[1] = ip6_0->dst_address.as_u64[1]; + key6_0.teid = gtpu0->teid; + p0 = hash_get_mem (gtm->gtpu6_tunnel_by_key, &key6_0); + if (PREDICT_TRUE (p0 != NULL)) + { + mt0 = pool_elt_at_index (gtm->tunnels, p0[0]); + goto next0; /* valid packet */ + } + } + error0 = GTPU_ERROR_NO_SUCH_TUNNEL; + next0 = GTPU_INPUT_NEXT_DROP; + goto trace0; + } + + next0: + /* Manipulate gtpu header */ + if (PREDICT_FALSE((gtpu0->ver_flags & GTPU_E_S_PN_BIT) != 0)) + { + gtpu_hdr_len0 = sizeof(gtpu_header_t); + + /* Manipulate Sequence Number and N-PDU Number */ + /* TBD */ + + /* Manipulate Next Extension Header */ + /* TBD */ + } + else + { + gtpu_hdr_len0 = sizeof(gtpu_header_t) - 4; + } + + /* Pop gtpu header */ + vlib_buffer_advance (b0, gtpu_hdr_len0); + + next0 = t0->decap_next_index; + sw_if_index0 = t0->sw_if_index; + len0 = vlib_buffer_length_in_chain (vm, b0); + + /* Required to make the l2 tag push / pop code work on l2 subifs */ + if (PREDICT_TRUE(next0 == GTPU_INPUT_NEXT_L2_INPUT)) + vnet_update_l2_len (b0); + + /* Set packet input sw_if_index to unicast GTPU tunnel for learning */ + vnet_buffer(b0)->sw_if_index[VLIB_RX] = sw_if_index0; + sw_if_index0 = (mt0) ? mt0->sw_if_index : sw_if_index0; + + pkts_decapsulated ++; + stats_n_packets += 1; + stats_n_bytes += len0; + + /* Batch stats increment on the same gtpu tunnel so counter + is not incremented per packet */ + if (PREDICT_FALSE (sw_if_index0 != stats_sw_if_index)) + { + stats_n_packets -= 1; + stats_n_bytes -= len0; + if (stats_n_packets) + vlib_increment_combined_counter + (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, + thread_index, stats_sw_if_index, + stats_n_packets, stats_n_bytes); + stats_n_packets = 1; + stats_n_bytes = len0; + stats_sw_if_index = sw_if_index0; + } + + trace0: + b0->error = error0 ? 
node->errors[error0] : 0; + + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + gtpu_rx_trace_t *tr + = vlib_add_trace (vm, node, b0, sizeof (*tr)); + tr->next_index = next0; + tr->error = error0; + tr->tunnel_index = tunnel_index0; + tr->teid = clib_net_to_host_u32(gtpu0->teid); + } + + if (PREDICT_FALSE ((gtpu1->ver_flags & GTPU_VER_MASK) != GTPU_V1_VER)) + { + error1 = GTPU_ERROR_BAD_VER; + next1 = GTPU_INPUT_NEXT_DROP; + goto trace1; + } + + /* Manipulate packet 1 */ + if (is_ip4) { + key4_1.src = ip4_1->src_address.as_u32; + key4_1.teid = gtpu1->teid; + + /* Make sure GTPU tunnel exist according to packet SIP and teid + * SIP identify a GTPU path, and teid identify a tunnel in a given GTPU path */ + if (PREDICT_FALSE (key4_1.as_u64 != last_key4.as_u64)) + { + p1 = hash_get (gtm->gtpu4_tunnel_by_key, key4_1.as_u64); + if (PREDICT_FALSE (p1 == NULL)) + { + error1 = GTPU_ERROR_NO_SUCH_TUNNEL; + next1 = GTPU_INPUT_NEXT_DROP; + goto trace1; + } + last_key4.as_u64 = key4_1.as_u64; + tunnel_index1 = last_tunnel_index = p1[0]; + } + else + tunnel_index1 = last_tunnel_index; + t1 = pool_elt_at_index (gtm->tunnels, tunnel_index1); + + /* Validate GTPU tunnel encap-fib index agaist packet */ + if (PREDICT_FALSE (validate_gtpu_fib (b1, t1, is_ip4) == 0)) + { + error1 = GTPU_ERROR_NO_SUCH_TUNNEL; + next1 = GTPU_INPUT_NEXT_DROP; + goto trace1; + } + + /* Validate GTPU tunnel SIP against packet DIP */ + if (PREDICT_TRUE (ip4_1->dst_address.as_u32 == t1->src.ip4.as_u32)) + goto next1; /* valid packet */ + if (PREDICT_FALSE (ip4_address_is_multicast (&ip4_1->dst_address))) + { + key4_1.src = ip4_1->dst_address.as_u32; + key4_1.teid = gtpu1->teid; + /* Make sure mcast GTPU tunnel exist by packet DIP and teid */ + p1 = hash_get (gtm->gtpu4_tunnel_by_key, key4_1.as_u64); + if (PREDICT_TRUE (p1 != NULL)) + { + mt1 = pool_elt_at_index (gtm->tunnels, p1[0]); + goto next1; /* valid packet */ + } + } + error1 = GTPU_ERROR_NO_SUCH_TUNNEL; + next1 = GTPU_INPUT_NEXT_DROP; + goto trace1; + + } else /* !is_ip4 */ { + key6_1.src.as_u64[0] = ip6_1->src_address.as_u64[0]; + key6_1.src.as_u64[1] = ip6_1->src_address.as_u64[1]; + key6_1.teid = gtpu1->teid; + + /* Make sure GTPU tunnel exist according to packet SIP and teid + * SIP identify a GTPU path, and teid identify a tunnel in a given GTPU path */ + if (PREDICT_FALSE (memcmp(&key6_1, &last_key6, sizeof(last_key6)) != 0)) + { + p1 = hash_get_mem (gtm->gtpu6_tunnel_by_key, &key6_1); + + if (PREDICT_FALSE (p1 == NULL)) + { + error1 = GTPU_ERROR_NO_SUCH_TUNNEL; + next1 = GTPU_INPUT_NEXT_DROP; + goto trace1; + } + + clib_memcpy (&last_key6, &key6_1, sizeof(key6_1)); + tunnel_index1 = last_tunnel_index = p1[0]; + } + else + tunnel_index1 = last_tunnel_index; + t1 = pool_elt_at_index (gtm->tunnels, tunnel_index1); + + /* Validate GTPU tunnel encap-fib index agaist packet */ + if (PREDICT_FALSE (validate_gtpu_fib (b1, t1, is_ip4) == 0)) + { + error1 = GTPU_ERROR_NO_SUCH_TUNNEL; + next1 = GTPU_INPUT_NEXT_DROP; + goto trace1; + } + + /* Validate GTPU tunnel SIP against packet DIP */ + if (PREDICT_TRUE (ip6_address_is_equal (&ip6_1->dst_address, + &t1->src.ip6))) + goto next1; /* valid packet */ + if (PREDICT_FALSE (ip6_address_is_multicast (&ip6_1->dst_address))) + { + key6_1.src.as_u64[0] = ip6_1->dst_address.as_u64[0]; + key6_1.src.as_u64[1] = ip6_1->dst_address.as_u64[1]; + key6_1.teid = gtpu1->teid; + p1 = hash_get_mem (gtm->gtpu6_tunnel_by_key, &key6_1); + if (PREDICT_TRUE (p1 != NULL)) + { + mt1 = pool_elt_at_index (gtm->tunnels, p1[0]); + goto next1; /* 
valid packet */ + } + } + error1 = GTPU_ERROR_NO_SUCH_TUNNEL; + next1 = GTPU_INPUT_NEXT_DROP; + goto trace1; + } + + next1: + /* Manipulate gtpu header */ + if (PREDICT_FALSE((gtpu1->ver_flags & GTPU_E_S_PN_BIT) != 0)) + { + gtpu_hdr_len1 = sizeof(gtpu_header_t); + + /* Manipulate Sequence Number and N-PDU Number */ + /* TBD */ + + /* Manipulate Next Extension Header */ + /* TBD */ + } + else + { + gtpu_hdr_len1 = sizeof(gtpu_header_t) - 4; + } + + /* Pop gtpu header */ + vlib_buffer_advance (b1, gtpu_hdr_len1); + + next1 = t1->decap_next_index; + sw_if_index1 = t1->sw_if_index; + len1 = vlib_buffer_length_in_chain (vm, b1); + + /* Required to make the l2 tag push / pop code work on l2 subifs */ + if (PREDICT_TRUE(next1 == GTPU_INPUT_NEXT_L2_INPUT)) + vnet_update_l2_len (b1); + + /* Set packet input sw_if_index to unicast GTPU tunnel for learning */ + vnet_buffer(b1)->sw_if_index[VLIB_RX] = sw_if_index1; + sw_if_index1 = (mt1) ? mt1->sw_if_index : sw_if_index1; + + pkts_decapsulated ++; + stats_n_packets += 1; + stats_n_bytes += len1; + + /* Batch stats increment on the same gtpu tunnel so counter + is not incremented per packet */ + if (PREDICT_FALSE (sw_if_index1 != stats_sw_if_index)) + { + stats_n_packets -= 1; + stats_n_bytes -= len1; + if (stats_n_packets) + vlib_increment_combined_counter + (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, + thread_index, stats_sw_if_index, + stats_n_packets, stats_n_bytes); + stats_n_packets = 1; + stats_n_bytes = len1; + stats_sw_if_index = sw_if_index1; + } + + trace1: + b1->error = error1 ? node->errors[error1] : 0; + + if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED)) + { + gtpu_rx_trace_t *tr + = vlib_add_trace (vm, node, b1, sizeof (*tr)); + tr->next_index = next1; + tr->error = error1; + tr->tunnel_index = tunnel_index1; + tr->teid = clib_net_to_host_u32(gtpu1->teid); + } + + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, next0, next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t * b0; + u32 next0; + ip4_header_t * ip4_0; + ip6_header_t * ip6_0; + gtpu_header_t * gtpu0; + u32 gtpu_hdr_len0 = 0; + uword * p0; + u32 tunnel_index0; + gtpu_tunnel_t * t0, * mt0 = NULL; + gtpu4_tunnel_key_t key4_0; + gtpu6_tunnel_key_t key6_0; + u32 error0; + u32 sw_if_index0, len0; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + /* udp leaves current_data pointing at the gtpu header */ + gtpu0 = vlib_buffer_get_current (b0); + if (is_ip4) { + vlib_buffer_advance + (b0, -(word)(sizeof(udp_header_t)+sizeof(ip4_header_t))); + ip4_0 = vlib_buffer_get_current (b0); + } else { + vlib_buffer_advance + (b0, -(word)(sizeof(udp_header_t)+sizeof(ip6_header_t))); + ip6_0 = vlib_buffer_get_current (b0); + } + + /* pop (ip, udp) */ + if (is_ip4) { + vlib_buffer_advance + (b0, sizeof(*ip4_0)+sizeof(udp_header_t)); + } else { + vlib_buffer_advance + (b0, sizeof(*ip6_0)+sizeof(udp_header_t)); + } + + tunnel_index0 = ~0; + error0 = 0; + if (PREDICT_FALSE ((gtpu0->ver_flags & GTPU_VER_MASK) != GTPU_V1_VER)) + { + error0 = GTPU_ERROR_BAD_VER; + next0 = GTPU_INPUT_NEXT_DROP; + goto trace00; + } + + if (is_ip4) { + key4_0.src = ip4_0->src_address.as_u32; + key4_0.teid = gtpu0->teid; + + /* Make sure GTPU tunnel exist according to packet SIP and teid + * SIP identify a GTPU path, and teid identify a tunnel in a given GTPU path */ + if (PREDICT_FALSE (key4_0.as_u64 != last_key4.as_u64)) 
+ { + p0 = hash_get (gtm->gtpu4_tunnel_by_key, key4_0.as_u64); + if (PREDICT_FALSE (p0 == NULL)) + { + error0 = GTPU_ERROR_NO_SUCH_TUNNEL; + next0 = GTPU_INPUT_NEXT_DROP; + goto trace00; + } + last_key4.as_u64 = key4_0.as_u64; + tunnel_index0 = last_tunnel_index = p0[0]; + } + else + tunnel_index0 = last_tunnel_index; + t0 = pool_elt_at_index (gtm->tunnels, tunnel_index0); + + /* Validate GTPU tunnel encap-fib index agaist packet */ + if (PREDICT_FALSE (validate_gtpu_fib (b0, t0, is_ip4) == 0)) + { + error0 = GTPU_ERROR_NO_SUCH_TUNNEL; + next0 = GTPU_INPUT_NEXT_DROP; + goto trace00; + } + + /* Validate GTPU tunnel SIP against packet DIP */ + if (PREDICT_TRUE (ip4_0->dst_address.as_u32 == t0->src.ip4.as_u32)) + goto next00; /* valid packet */ + if (PREDICT_FALSE (ip4_address_is_multicast (&ip4_0->dst_address))) + { + key4_0.src = ip4_0->dst_address.as_u32; + key4_0.teid = gtpu0->teid; + /* Make sure mcast GTPU tunnel exist by packet DIP and teid */ + p0 = hash_get (gtm->gtpu4_tunnel_by_key, key4_0.as_u64); + if (PREDICT_TRUE (p0 != NULL)) + { + mt0 = pool_elt_at_index (gtm->tunnels, p0[0]); + goto next00; /* valid packet */ + } + } + error0 = GTPU_ERROR_NO_SUCH_TUNNEL; + next0 = GTPU_INPUT_NEXT_DROP; + goto trace00; + + } else /* !is_ip4 */ { + key6_0.src.as_u64[0] = ip6_0->src_address.as_u64[0]; + key6_0.src.as_u64[1] = ip6_0->src_address.as_u64[1]; + key6_0.teid = gtpu0->teid; + + /* Make sure GTPU tunnel exist according to packet SIP and teid + * SIP identify a GTPU path, and teid identify a tunnel in a given GTPU path */ + if (PREDICT_FALSE (memcmp(&key6_0, &last_key6, sizeof(last_key6)) != 0)) + { + p0 = hash_get_mem (gtm->gtpu6_tunnel_by_key, &key6_0); + if (PREDICT_FALSE (p0 == NULL)) + { + error0 = GTPU_ERROR_NO_SUCH_TUNNEL; + next0 = GTPU_INPUT_NEXT_DROP; + goto trace00; + } + clib_memcpy (&last_key6, &key6_0, sizeof(key6_0)); + tunnel_index0 = last_tunnel_index = p0[0]; + } + else + tunnel_index0 = last_tunnel_index; + t0 = pool_elt_at_index (gtm->tunnels, tunnel_index0); + + /* Validate GTPU tunnel encap-fib index agaist packet */ + if (PREDICT_FALSE (validate_gtpu_fib (b0, t0, is_ip4) == 0)) + { + error0 = GTPU_ERROR_NO_SUCH_TUNNEL; + next0 = GTPU_INPUT_NEXT_DROP; + goto trace00; + } + + /* Validate GTPU tunnel SIP against packet DIP */ + if (PREDICT_TRUE (ip6_address_is_equal (&ip6_0->dst_address, + &t0->src.ip6))) + goto next00; /* valid packet */ + if (PREDICT_FALSE (ip6_address_is_multicast (&ip6_0->dst_address))) + { + key6_0.src.as_u64[0] = ip6_0->dst_address.as_u64[0]; + key6_0.src.as_u64[1] = ip6_0->dst_address.as_u64[1]; + key6_0.teid = gtpu0->teid; + p0 = hash_get_mem (gtm->gtpu6_tunnel_by_key, &key6_0); + if (PREDICT_TRUE (p0 != NULL)) + { + mt0 = pool_elt_at_index (gtm->tunnels, p0[0]); + goto next00; /* valid packet */ + } + } + error0 = GTPU_ERROR_NO_SUCH_TUNNEL; + next0 = GTPU_INPUT_NEXT_DROP; + goto trace00; + } + + next00: + /* Manipulate gtpu header */ + if (PREDICT_FALSE((gtpu0->ver_flags & GTPU_E_S_PN_BIT) != 0)) + { + gtpu_hdr_len0 = sizeof(gtpu_header_t); + + /* Manipulate Sequence Number and N-PDU Number */ + /* TBD */ + + /* Manipulate Next Extension Header */ + /* TBD */ + } + else + { + gtpu_hdr_len0 = sizeof(gtpu_header_t) - 4; + } + + /* Pop gtpu header */ + vlib_buffer_advance (b0, gtpu_hdr_len0); + + next0 = t0->decap_next_index; + sw_if_index0 = t0->sw_if_index; + len0 = vlib_buffer_length_in_chain (vm, b0); + + /* Required to make the l2 tag push / pop code work on l2 subifs */ + if (PREDICT_TRUE(next0 == GTPU_INPUT_NEXT_L2_INPUT)) + 
vnet_update_l2_len (b0); + + /* Set packet input sw_if_index to unicast GTPU tunnel for learning */ + vnet_buffer(b0)->sw_if_index[VLIB_RX] = sw_if_index0; + sw_if_index0 = (mt0) ? mt0->sw_if_index : sw_if_index0; + + pkts_decapsulated ++; + stats_n_packets += 1; + stats_n_bytes += len0; + + /* Batch stats increment on the same gtpu tunnel so counter + is not incremented per packet */ + if (PREDICT_FALSE (sw_if_index0 != stats_sw_if_index)) + { + stats_n_packets -= 1; + stats_n_bytes -= len0; + if (stats_n_packets) + vlib_increment_combined_counter + (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, + thread_index, stats_sw_if_index, + stats_n_packets, stats_n_bytes); + stats_n_packets = 1; + stats_n_bytes = len0; + stats_sw_if_index = sw_if_index0; + } + + trace00: + b0->error = error0 ? node->errors[error0] : 0; + + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + gtpu_rx_trace_t *tr + = vlib_add_trace (vm, node, b0, sizeof (*tr)); + tr->next_index = next0; + tr->error = error0; + tr->tunnel_index = tunnel_index0; + tr->teid = clib_net_to_host_u32(gtpu0->teid); + } + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + /* Do we still need this now that tunnel tx stats is kept? */ + vlib_node_increment_counter (vm, is_ip4? + gtpu4_input_node.index:gtpu6_input_node.index, + GTPU_ERROR_DECAPSULATED, + pkts_decapsulated); + + /* Increment any remaining batch stats */ + if (stats_n_packets) + { + vlib_increment_combined_counter + (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + node->runtime_data[0] = stats_sw_if_index; + } + + return from_frame->n_vectors; +} + +static uword +gtpu4_input (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return gtpu_input(vm, node, from_frame, /* is_ip4 */ 1); +} + +static uword +gtpu6_input (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return gtpu_input(vm, node, from_frame, /* is_ip4 */ 0); +} + +static char * gtpu_error_strings[] = { +#define gtpu_error(n,s) s, +#include <gtpu/gtpu_error.def> +#undef gtpu_error +#undef _ +}; + +VLIB_REGISTER_NODE (gtpu4_input_node) = { + .function = gtpu4_input, + .name = "gtpu4-input", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + + .n_errors = GTPU_N_ERROR, + .error_strings = gtpu_error_strings, + + .n_next_nodes = GTPU_INPUT_N_NEXT, + .next_nodes = { +#define _(s,n) [GTPU_INPUT_NEXT_##s] = n, + foreach_gtpu_input_next +#undef _ + }, + +//temp .format_buffer = format_gtpu_header, + .format_trace = format_gtpu_rx_trace, + // $$$$ .unformat_buffer = unformat_gtpu_header, +}; + +VLIB_NODE_FUNCTION_MULTIARCH (gtpu4_input_node, gtpu4_input) + +VLIB_REGISTER_NODE (gtpu6_input_node) = { + .function = gtpu6_input, + .name = "gtpu6-input", + /* Takes a vector of packets. 
 */
+  .vector_size = sizeof (u32),
+
+  .n_errors = GTPU_N_ERROR,
+  .error_strings = gtpu_error_strings,
+
+  .n_next_nodes = GTPU_INPUT_N_NEXT,
+  .next_nodes = {
+#define _(s,n) [GTPU_INPUT_NEXT_##s] = n,
+    foreach_gtpu_input_next
+#undef _
+  },
+
+//temp  .format_buffer = format_gtpu_header,
+  .format_trace = format_gtpu_rx_trace,
+  // $$$$ .unformat_buffer = unformat_gtpu_header,
+};
+
+VLIB_NODE_FUNCTION_MULTIARCH (gtpu6_input_node, gtpu6_input)
+
+
+typedef enum {
+  IP_GTPU_BYPASS_NEXT_DROP,
+  IP_GTPU_BYPASS_NEXT_GTPU,
+  IP_GTPU_BYPASS_N_NEXT,
+} ip_gtpu_bypass_next_t;
+
+always_inline uword
+ip_gtpu_bypass_inline (vlib_main_t * vm,
+                       vlib_node_runtime_t * node,
+                       vlib_frame_t * frame,
+                       u32 is_ip4)
+{
+  gtpu_main_t * gtm = &gtpu_main;
+  u32 * from, * to_next, n_left_from, n_left_to_next, next_index;
+  vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip4_input_node.index);
+  ip4_address_t addr4; /* last IPv4 address matching a local VTEP address */
+  ip6_address_t addr6; /* last IPv6 address matching a local VTEP address */
+
+  from = vlib_frame_vector_args (frame);
+  n_left_from = frame->n_vectors;
+  next_index = node->cached_next_index;
+
+  if (node->flags & VLIB_NODE_FLAG_TRACE)
+    ip4_forward_next_trace (vm, node, frame, VLIB_TX);
+
+  if (is_ip4) addr4.data_u32 = ~0;
+  else ip6_address_set_zero (&addr6);
+
+  while (n_left_from > 0)
+    {
+      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+      while (n_left_from >= 4 && n_left_to_next >= 2)
+        {
+          vlib_buffer_t * b0, * b1;
+          ip4_header_t * ip40, * ip41;
+          ip6_header_t * ip60, * ip61;
+          udp_header_t * udp0, * udp1;
+          u32 bi0, ip_len0, udp_len0, flags0, next0;
+          u32 bi1, ip_len1, udp_len1, flags1, next1;
+          i32 len_diff0, len_diff1;
+          u8 error0, good_udp0, proto0;
+          u8 error1, good_udp1, proto1;
+
+          /* Prefetch next iteration. */
+          {
+            vlib_buffer_t * p2, * p3;
+
+            p2 = vlib_get_buffer (vm, from[2]);
+            p3 = vlib_get_buffer (vm, from[3]);
+
+            vlib_prefetch_buffer_header (p2, LOAD);
+            vlib_prefetch_buffer_header (p3, LOAD);
+
+            CLIB_PREFETCH (p2->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+            CLIB_PREFETCH (p3->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+          }
+
+          bi0 = to_next[0] = from[0];
+          bi1 = to_next[1] = from[1];
+          from += 2;
+          n_left_from -= 2;
+          to_next += 2;
+          n_left_to_next -= 2;
+
+          b0 = vlib_get_buffer (vm, bi0);
+          b1 = vlib_get_buffer (vm, bi1);
+          if (is_ip4)
+            {
+              ip40 = vlib_buffer_get_current (b0);
+              ip41 = vlib_buffer_get_current (b1);
+            }
+          else
+            {
+              ip60 = vlib_buffer_get_current (b0);
+              ip61 = vlib_buffer_get_current (b1);
+            }
+
+          /* Setup packet for next IP feature */
+          vnet_feature_next(vnet_buffer(b0)->sw_if_index[VLIB_RX], &next0, b0);
+          vnet_feature_next(vnet_buffer(b1)->sw_if_index[VLIB_RX], &next1, b1);
+
+          if (is_ip4)
+            {
+              /* Treat IP frag packets as "experimental" protocol for now
+                 until support of IP frag reassembly is implemented */
+              proto0 = ip4_is_fragment(ip40) ? 0xfe : ip40->protocol;
+              proto1 = ip4_is_fragment(ip41) ?
0xfe : ip41->protocol; + } + else + { + proto0 = ip60->protocol; + proto1 = ip61->protocol; + } + + /* Process packet 0 */ + if (proto0 != IP_PROTOCOL_UDP) + goto exit0; /* not UDP packet */ + + if (is_ip4) + udp0 = ip4_next_header (ip40); + else + udp0 = ip6_next_header (ip60); + + if (udp0->dst_port != clib_host_to_net_u16 (UDP_DST_PORT_GTPU)) + goto exit0; /* not GTPU packet */ + + /* Validate DIP against VTEPs*/ + if (is_ip4) + { + if (addr4.as_u32 != ip40->dst_address.as_u32) + { + if (!hash_get (gtm->vtep4, ip40->dst_address.as_u32)) + goto exit0; /* no local VTEP for GTPU packet */ + addr4 = ip40->dst_address; + } + } + else + { + if (!ip6_address_is_equal (&addr6, &ip60->dst_address)) + { + if (!hash_get_mem (gtm->vtep6, &ip60->dst_address)) + goto exit0; /* no local VTEP for GTPU packet */ + addr6 = ip60->dst_address; + } + } + + flags0 = b0->flags; + good_udp0 = (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; + + /* Don't verify UDP checksum for packets with explicit zero checksum. */ + good_udp0 |= udp0->checksum == 0; + + /* Verify UDP length */ + if (is_ip4) + ip_len0 = clib_net_to_host_u16 (ip40->length); + else + ip_len0 = clib_net_to_host_u16 (ip60->payload_length); + udp_len0 = clib_net_to_host_u16 (udp0->length); + len_diff0 = ip_len0 - udp_len0; + + /* Verify UDP checksum */ + if (PREDICT_FALSE (!good_udp0)) + { + if ((flags0 & VNET_BUFFER_F_L4_CHECKSUM_COMPUTED) == 0) + { + if (is_ip4) + flags0 = ip4_tcp_udp_validate_checksum (vm, b0); + else + flags0 = ip6_tcp_udp_icmp_validate_checksum (vm, b0); + good_udp0 = + (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; + } + } + + if (is_ip4) + { + error0 = good_udp0 ? 0 : IP4_ERROR_UDP_CHECKSUM; + error0 = (len_diff0 >= 0) ? error0 : IP4_ERROR_UDP_LENGTH; + } + else + { + error0 = good_udp0 ? 0 : IP6_ERROR_UDP_CHECKSUM; + error0 = (len_diff0 >= 0) ? error0 : IP6_ERROR_UDP_LENGTH; + } + + next0 = error0 ? + IP_GTPU_BYPASS_NEXT_DROP : IP_GTPU_BYPASS_NEXT_GTPU; + b0->error = error0 ? error_node->errors[error0] : 0; + + /* gtpu-input node expect current at GTPU header */ + if (is_ip4) + vlib_buffer_advance (b0, sizeof(ip4_header_t)+sizeof(udp_header_t)); + else + vlib_buffer_advance (b0, sizeof(ip6_header_t)+sizeof(udp_header_t)); + + exit0: + /* Process packet 1 */ + if (proto1 != IP_PROTOCOL_UDP) + goto exit1; /* not UDP packet */ + + if (is_ip4) + udp1 = ip4_next_header (ip41); + else + udp1 = ip6_next_header (ip61); + + if (udp1->dst_port != clib_host_to_net_u16 (UDP_DST_PORT_GTPU)) + goto exit1; /* not GTPU packet */ + + /* Validate DIP against VTEPs*/ + if (is_ip4) + { + if (addr4.as_u32 != ip41->dst_address.as_u32) + { + if (!hash_get (gtm->vtep4, ip41->dst_address.as_u32)) + goto exit1; /* no local VTEP for GTPU packet */ + addr4 = ip41->dst_address; + } + } + else + { + if (!ip6_address_is_equal (&addr6, &ip61->dst_address)) + { + if (!hash_get_mem (gtm->vtep6, &ip61->dst_address)) + goto exit1; /* no local VTEP for GTPU packet */ + addr6 = ip61->dst_address; + } + } + + flags1 = b1->flags; + good_udp1 = (flags1 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; + + /* Don't verify UDP checksum for packets with explicit zero checksum. 
*/ + good_udp1 |= udp1->checksum == 0; + + /* Verify UDP length */ + if (is_ip4) + ip_len1 = clib_net_to_host_u16 (ip41->length); + else + ip_len1 = clib_net_to_host_u16 (ip61->payload_length); + udp_len1 = clib_net_to_host_u16 (udp1->length); + len_diff1 = ip_len1 - udp_len1; + + /* Verify UDP checksum */ + if (PREDICT_FALSE (!good_udp1)) + { + if ((flags1 & VNET_BUFFER_F_L4_CHECKSUM_COMPUTED) == 0) + { + if (is_ip4) + flags1 = ip4_tcp_udp_validate_checksum (vm, b1); + else + flags1 = ip6_tcp_udp_icmp_validate_checksum (vm, b1); + good_udp1 = + (flags1 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; + } + } + + if (is_ip4) + { + error1 = good_udp1 ? 0 : IP4_ERROR_UDP_CHECKSUM; + error1 = (len_diff1 >= 0) ? error1 : IP4_ERROR_UDP_LENGTH; + } + else + { + error1 = good_udp1 ? 0 : IP6_ERROR_UDP_CHECKSUM; + error1 = (len_diff1 >= 0) ? error1 : IP6_ERROR_UDP_LENGTH; + } + + next1 = error1 ? + IP_GTPU_BYPASS_NEXT_DROP : IP_GTPU_BYPASS_NEXT_GTPU; + b1->error = error1 ? error_node->errors[error1] : 0; + + /* gtpu-input node expect current at GTPU header */ + if (is_ip4) + vlib_buffer_advance (b1, sizeof(ip4_header_t)+sizeof(udp_header_t)); + else + vlib_buffer_advance (b1, sizeof(ip6_header_t)+sizeof(udp_header_t)); + + exit1: + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, next0, next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + vlib_buffer_t * b0; + ip4_header_t * ip40; + ip6_header_t * ip60; + udp_header_t * udp0; + u32 bi0, ip_len0, udp_len0, flags0, next0; + i32 len_diff0; + u8 error0, good_udp0, proto0; + + bi0 = to_next[0] = from[0]; + from += 1; + n_left_from -= 1; + to_next += 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + if (is_ip4) + ip40 = vlib_buffer_get_current (b0); + else + ip60 = vlib_buffer_get_current (b0); + + /* Setup packet for next IP feature */ + vnet_feature_next(vnet_buffer(b0)->sw_if_index[VLIB_RX], &next0, b0); + + if (is_ip4) + /* Treat IP4 frag packets as "experimental" protocol for now + until support of IP frag reassembly is implemented */ + proto0 = ip4_is_fragment(ip40) ? 0xfe : ip40->protocol; + else + proto0 = ip60->protocol; + + if (proto0 != IP_PROTOCOL_UDP) + goto exit; /* not UDP packet */ + + if (is_ip4) + udp0 = ip4_next_header (ip40); + else + udp0 = ip6_next_header (ip60); + + if (udp0->dst_port != clib_host_to_net_u16 (UDP_DST_PORT_GTPU)) + goto exit; /* not GTPU packet */ + + /* Validate DIP against VTEPs*/ + if (is_ip4) + { + if (addr4.as_u32 != ip40->dst_address.as_u32) + { + if (!hash_get (gtm->vtep4, ip40->dst_address.as_u32)) + goto exit; /* no local VTEP for GTPU packet */ + addr4 = ip40->dst_address; + } + } + else + { + if (!ip6_address_is_equal (&addr6, &ip60->dst_address)) + { + if (!hash_get_mem (gtm->vtep6, &ip60->dst_address)) + goto exit; /* no local VTEP for GTPU packet */ + addr6 = ip60->dst_address; + } + } + + flags0 = b0->flags; + good_udp0 = (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; + + /* Don't verify UDP checksum for packets with explicit zero checksum. 
*/ + good_udp0 |= udp0->checksum == 0; + + /* Verify UDP length */ + if (is_ip4) + ip_len0 = clib_net_to_host_u16 (ip40->length); + else + ip_len0 = clib_net_to_host_u16 (ip60->payload_length); + udp_len0 = clib_net_to_host_u16 (udp0->length); + len_diff0 = ip_len0 - udp_len0; + + /* Verify UDP checksum */ + if (PREDICT_FALSE (!good_udp0)) + { + if ((flags0 & VNET_BUFFER_F_L4_CHECKSUM_COMPUTED) == 0) + { + if (is_ip4) + flags0 = ip4_tcp_udp_validate_checksum (vm, b0); + else + flags0 = ip6_tcp_udp_icmp_validate_checksum (vm, b0); + good_udp0 = + (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; + } + } + + if (is_ip4) + { + error0 = good_udp0 ? 0 : IP4_ERROR_UDP_CHECKSUM; + error0 = (len_diff0 >= 0) ? error0 : IP4_ERROR_UDP_LENGTH; + } + else + { + error0 = good_udp0 ? 0 : IP6_ERROR_UDP_CHECKSUM; + error0 = (len_diff0 >= 0) ? error0 : IP6_ERROR_UDP_LENGTH; + } + + next0 = error0 ? + IP_GTPU_BYPASS_NEXT_DROP : IP_GTPU_BYPASS_NEXT_GTPU; + b0->error = error0 ? error_node->errors[error0] : 0; + + /* gtpu-input node expect current at GTPU header */ + if (is_ip4) + vlib_buffer_advance (b0, sizeof(ip4_header_t)+sizeof(udp_header_t)); + else + vlib_buffer_advance (b0, sizeof(ip6_header_t)+sizeof(udp_header_t)); + + exit: + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return frame->n_vectors; +} + +static uword +ip4_gtpu_bypass (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return ip_gtpu_bypass_inline (vm, node, frame, /* is_ip4 */ 1); +} + +VLIB_REGISTER_NODE (ip4_gtpu_bypass_node) = { + .function = ip4_gtpu_bypass, + .name = "ip4-gtpu-bypass", + .vector_size = sizeof (u32), + + .n_next_nodes = IP_GTPU_BYPASS_N_NEXT, + .next_nodes = { + [IP_GTPU_BYPASS_NEXT_DROP] = "error-drop", + [IP_GTPU_BYPASS_NEXT_GTPU] = "gtpu4-input", + }, + + .format_buffer = format_ip4_header, + .format_trace = format_ip4_forward_next_trace, +}; + +VLIB_NODE_FUNCTION_MULTIARCH (ip4_gtpu_bypass_node,ip4_gtpu_bypass) + +/* Dummy init function to get us linked in. */ +clib_error_t * ip4_gtpu_bypass_init (vlib_main_t * vm) +{ return 0; } + +VLIB_INIT_FUNCTION (ip4_gtpu_bypass_init); + +static uword +ip6_gtpu_bypass (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return ip_gtpu_bypass_inline (vm, node, frame, /* is_ip4 */ 0); +} + +VLIB_REGISTER_NODE (ip6_gtpu_bypass_node) = { + .function = ip6_gtpu_bypass, + .name = "ip6-gtpu-bypass", + .vector_size = sizeof (u32), + + .n_next_nodes = IP_GTPU_BYPASS_N_NEXT, + .next_nodes = { + [IP_GTPU_BYPASS_NEXT_DROP] = "error-drop", + [IP_GTPU_BYPASS_NEXT_GTPU] = "gtpu6-input", + }, + + .format_buffer = format_ip6_header, + .format_trace = format_ip6_forward_next_trace, +}; + +VLIB_NODE_FUNCTION_MULTIARCH (ip6_gtpu_bypass_node,ip6_gtpu_bypass) + +/* Dummy init function to get us linked in. */ +clib_error_t * ip6_gtpu_bypass_init (vlib_main_t * vm) +{ return 0; } + +VLIB_INIT_FUNCTION (ip6_gtpu_bypass_init); diff --git a/src/plugins/gtpu/gtpu_encap.c b/src/plugins/gtpu/gtpu_encap.c new file mode 100644 index 00000000..8ad53c53 --- /dev/null +++ b/src/plugins/gtpu/gtpu_encap.c @@ -0,0 +1,705 @@ +/* + * Copyright (c) 2017 Intel and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vppinfra/error.h>
+#include <vppinfra/hash.h>
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h>
+#include <gtpu/gtpu.h>
+
+/* Statistics (not all errors) */
+#define foreach_gtpu_encap_error    \
+_(ENCAPSULATED, "good packets encapsulated")
+
+static char * gtpu_encap_error_strings[] = {
+#define _(sym,string) string,
+  foreach_gtpu_encap_error
+#undef _
+};
+
+typedef enum {
+#define _(sym,str) GTPU_ENCAP_ERROR_##sym,
+    foreach_gtpu_encap_error
+#undef _
+    GTPU_ENCAP_N_ERROR,
+} gtpu_encap_error_t;
+
+#define foreach_gtpu_encap_next        \
+_(DROP, "error-drop")                  \
+_(IP4_LOOKUP, "ip4-lookup")            \
+_(IP6_LOOKUP, "ip6-lookup")
+
+typedef enum {
+    GTPU_ENCAP_NEXT_DROP,
+    GTPU_ENCAP_NEXT_IP4_LOOKUP,
+    GTPU_ENCAP_NEXT_IP6_LOOKUP,
+    GTPU_ENCAP_N_NEXT,
+} gtpu_encap_next_t;
+
+typedef struct {
+  u32 tunnel_index;
+  u32 teid;
+} gtpu_encap_trace_t;
+
+u8 * format_gtpu_encap_trace (u8 * s, va_list * args)
+{
+  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+  gtpu_encap_trace_t * t
+      = va_arg (*args, gtpu_encap_trace_t *);
+
+  s = format (s, "GTPU encap to gtpu_tunnel%d teid %d",
+              t->tunnel_index, t->teid);
+  return s;
+}
+
+
+#define foreach_fixed_header4_offset            \
+    _(0) _(1) _(2) _(3)
+
+#define foreach_fixed_header6_offset            \
+    _(0) _(1) _(2) _(3) _(4) _(5) _(6)
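A worked check on those offset lists (illustrative only; the sizes follow the
packed headers in gtpu.h above): the IPv4 rewrite prepends ip4 (20) + udp (8)
plus the 8 mandatory GTP-U octets, i.e. 36 bytes, copied as offsets 0..3 in
u64 units plus one trailing u32; the IPv6 rewrite prepends ip6 (40) + udp (8)
+ 8 = 56 bytes, i.e. offsets 0..6 with no remainder:

    /* Illustrative compile-time checks, not part of the patch. The 12-byte
     * gtpu_header_t carries 4 optional octets; the rewrite uses only the
     * first 8 (mandatory) octets. */
    _Static_assert (sizeof (gtpu_header_t) - 4 == 8,
                    "mandatory GTP-U header is 8 octets");
    _Static_assert (sizeof (ip4_header_t) + sizeof (udp_header_t) + 8
                    == 4 * sizeof (u64) + sizeof (u32),
                    "ip4 rewrite is 36 octets: 4 x u64 + 1 x u32");
    _Static_assert (sizeof (ip6_header_t) + sizeof (udp_header_t) + 8
                    == 7 * sizeof (u64),
                    "ip6 rewrite is 56 octets: 7 x u64");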
+always_inline uword
+gtpu_encap_inline (vlib_main_t * vm,
+                   vlib_node_runtime_t * node,
+                   vlib_frame_t * from_frame,
+                   u32 is_ip4)
+{
+  u32 n_left_from, next_index, * from, * to_next;
+  gtpu_main_t * gtm = &gtpu_main;
+  vnet_main_t * vnm = gtm->vnet_main;
+  vnet_interface_main_t * im = &vnm->interface_main;
+  u32 pkts_encapsulated = 0;
+  u16 old_l0 = 0, old_l1 = 0, old_l2 = 0, old_l3 = 0;
+  u32 thread_index = vlib_get_thread_index();
+  u32 stats_sw_if_index, stats_n_packets, stats_n_bytes;
+  u32 sw_if_index0 = 0, sw_if_index1 = 0, sw_if_index2 = 0, sw_if_index3 = 0;
+  u32 next0 = 0, next1 = 0, next2 = 0, next3 = 0;
+  vnet_hw_interface_t * hi0, * hi1, * hi2, * hi3;
+  gtpu_tunnel_t * t0 = NULL, * t1 = NULL, * t2 = NULL, * t3 = NULL;
+
+  from = vlib_frame_vector_args (from_frame);
+  n_left_from = from_frame->n_vectors;
+
+  next_index = node->cached_next_index;
+  stats_sw_if_index = node->runtime_data[0];
+  stats_n_packets = stats_n_bytes = 0;
+
+  while (n_left_from > 0)
+    {
+      u32 n_left_to_next;
+
+      vlib_get_next_frame (vm, node, next_index,
+                           to_next, n_left_to_next);
+
+      while (n_left_from >= 8 && n_left_to_next >= 4)
+        {
+          u32 bi0, bi1, bi2, bi3;
+          vlib_buffer_t * b0, * b1, * b2, * b3;
+          u32 flow_hash0, flow_hash1, flow_hash2, flow_hash3;
+          u32 len0, len1, len2, len3;
+          ip4_header_t * ip4_0, * ip4_1, * ip4_2, * ip4_3;
+          ip6_header_t * ip6_0, * ip6_1, * ip6_2, * ip6_3;
+          udp_header_t * udp0, * udp1, * udp2, * udp3;
+          gtpu_header_t * gtpu0, * gtpu1, * gtpu2, * gtpu3;
+          u64 * copy_src0, * copy_dst0;
+          u64 * copy_src1, * copy_dst1;
+          u64 * copy_src2, * copy_dst2;
+          u64 * copy_src3, * copy_dst3;
+          u32 * copy_src_last0, * copy_dst_last0;
+          u32 * copy_src_last1, * copy_dst_last1;
+          u32 * copy_src_last2, * copy_dst_last2;
+          u32 * copy_src_last3, * copy_dst_last3;
+          u16 new_l0, new_l1, new_l2, new_l3;
+          ip_csum_t sum0, sum1, sum2, sum3;
+
+          /* Prefetch next iteration. */
+          {
+            vlib_buffer_t * p4, * p5, * p6, * p7;
+
+            p4 = vlib_get_buffer (vm, from[4]);
+            p5 = vlib_get_buffer (vm, from[5]);
+            p6 = vlib_get_buffer (vm, from[6]);
+            p7 = vlib_get_buffer (vm, from[7]);
+
+            vlib_prefetch_buffer_header (p4, LOAD);
+            vlib_prefetch_buffer_header (p5, LOAD);
+            vlib_prefetch_buffer_header (p6, LOAD);
+            vlib_prefetch_buffer_header (p7, LOAD);
+
+            CLIB_PREFETCH (p4->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+            CLIB_PREFETCH (p5->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+            CLIB_PREFETCH (p6->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+            CLIB_PREFETCH (p7->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+          }
+
+          bi0 = from[0];
+          bi1 = from[1];
+          bi2 = from[2];
+          bi3 = from[3];
+          to_next[0] = bi0;
+          to_next[1] = bi1;
+          to_next[2] = bi2;
+          to_next[3] = bi3;
+          from += 4;
+          to_next += 4;
+          n_left_to_next -= 4;
+          n_left_from -= 4;
+
+          b0 = vlib_get_buffer (vm, bi0);
+          b1 = vlib_get_buffer (vm, bi1);
+          b2 = vlib_get_buffer (vm, bi2);
+          b3 = vlib_get_buffer (vm, bi3);
+
+          flow_hash0 = vnet_l2_compute_flow_hash (b0);
+          flow_hash1 = vnet_l2_compute_flow_hash (b1);
+          flow_hash2 = vnet_l2_compute_flow_hash (b2);
+          flow_hash3 = vnet_l2_compute_flow_hash (b3);
+
+          /* Get next node index and adj index from tunnel next_dpo */
+          sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_TX];
+          sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_TX];
+          sw_if_index2 = vnet_buffer(b2)->sw_if_index[VLIB_TX];
+          sw_if_index3 = vnet_buffer(b3)->sw_if_index[VLIB_TX];
+          hi0 = vnet_get_sup_hw_interface (vnm, sw_if_index0);
+          hi1 = vnet_get_sup_hw_interface (vnm, sw_if_index1);
+          hi2 = vnet_get_sup_hw_interface (vnm, sw_if_index2);
+          hi3 = vnet_get_sup_hw_interface (vnm, sw_if_index3);
+          t0 = &gtm->tunnels[hi0->dev_instance];
+          t1 = &gtm->tunnels[hi1->dev_instance];
+          t2 = &gtm->tunnels[hi2->dev_instance];
+          t3 = &gtm->tunnels[hi3->dev_instance];
+
+          /* Note: change to always set next0 if it may be set to drop */
+          next0 = t0->next_dpo.dpoi_next_node;
+          vnet_buffer(b0)->ip.adj_index[VLIB_TX] = t0->next_dpo.dpoi_index;
+          next1 = t1->next_dpo.dpoi_next_node;
+          vnet_buffer(b1)->ip.adj_index[VLIB_TX] = t1->next_dpo.dpoi_index;
+          next2 = t2->next_dpo.dpoi_next_node;
+          vnet_buffer(b2)->ip.adj_index[VLIB_TX] = t2->next_dpo.dpoi_index;
+          next3 = t3->next_dpo.dpoi_next_node;
+          vnet_buffer(b3)->ip.adj_index[VLIB_TX] = t3->next_dpo.dpoi_index;
+
+          /* Apply the rewrite string. $$$$ vnet_rewrite? */
+          vlib_buffer_advance (b0, -(word)_vec_len(t0->rewrite));
+          vlib_buffer_advance (b1, -(word)_vec_len(t1->rewrite));
+          vlib_buffer_advance (b2, -(word)_vec_len(t2->rewrite));
+          vlib_buffer_advance (b3, -(word)_vec_len(t3->rewrite));
+
+          if (is_ip4)
+            {
+              ip4_0 = vlib_buffer_get_current(b0);
+              ip4_1 = vlib_buffer_get_current(b1);
+              ip4_2 = vlib_buffer_get_current(b2);
+              ip4_3 = vlib_buffer_get_current(b3);
+
+              /* Copy the fixed header */
+              copy_dst0 = (u64 *) ip4_0;
+              copy_src0 = (u64 *) t0->rewrite;
+              copy_dst1 = (u64 *) ip4_1;
+              copy_src1 = (u64 *) t1->rewrite;
+              copy_dst2 = (u64 *) ip4_2;
+              copy_src2 = (u64 *) t2->rewrite;
+              copy_dst3 = (u64 *) ip4_3;
+              copy_src3 = (u64 *) t3->rewrite;
+
+              /* Copy first 32 octets 8-bytes at a time */
+#define _(offs) copy_dst0[offs] = copy_src0[offs];
+              foreach_fixed_header4_offset;
+#undef _
+#define _(offs) copy_dst1[offs] = copy_src1[offs];
+              foreach_fixed_header4_offset;
+#undef _
+#define _(offs) copy_dst2[offs] = copy_src2[offs];
+              foreach_fixed_header4_offset;
+#undef _
+#define _(offs) copy_dst3[offs] = copy_src3[offs];
+              foreach_fixed_header4_offset;
+#undef _
+              /* Last 4 octets. Hopefully gcc will be our friend */
+              copy_dst_last0 = (u32 *)(&copy_dst0[4]);
+              copy_src_last0 = (u32 *)(&copy_src0[4]);
+              copy_dst_last0[0] = copy_src_last0[0];
+              copy_dst_last1 = (u32 *)(&copy_dst1[4]);
+              copy_src_last1 = (u32 *)(&copy_src1[4]);
+              copy_dst_last1[0] = copy_src_last1[0];
+              copy_dst_last2 = (u32 *)(&copy_dst2[4]);
+              copy_src_last2 = (u32 *)(&copy_src2[4]);
+              copy_dst_last2[0] = copy_src_last2[0];
+              copy_dst_last3 = (u32 *)(&copy_dst3[4]);
+              copy_src_last3 = (u32 *)(&copy_src3[4]);
+              copy_dst_last3[0] = copy_src_last3[0];
+
+              /* Fix the IP4 checksum and length */
+              sum0 = ip4_0->checksum;
+              new_l0 = /* old_l0 always 0, see the rewrite setup */
+                clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0));
+              sum0 = ip_csum_update (sum0, old_l0, new_l0, ip4_header_t,
+                                     length /* changed member */);
+              ip4_0->checksum = ip_csum_fold (sum0);
+              ip4_0->length = new_l0;
+              sum1 = ip4_1->checksum;
+              new_l1 = /* old_l1 always 0, see the rewrite setup */
+                clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b1));
+              sum1 = ip_csum_update (sum1, old_l1, new_l1, ip4_header_t,
+                                     length /* changed member */);
+              ip4_1->checksum = ip_csum_fold (sum1);
+              ip4_1->length = new_l1;
+              sum2 = ip4_2->checksum;
+              new_l2 = /* old_l2 always 0, see the rewrite setup */
+                clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b2));
+              sum2 = ip_csum_update (sum2, old_l2, new_l2, ip4_header_t,
+                                     length /* changed member */);
+              ip4_2->checksum = ip_csum_fold (sum2);
+              ip4_2->length = new_l2;
+              sum3 = ip4_3->checksum;
+              new_l3 = /* old_l3 always 0, see the rewrite setup */
+                clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b3));
+              sum3 = ip_csum_update (sum3, old_l3, new_l3, ip4_header_t,
+                                     length /* changed member */);
+              ip4_3->checksum = ip_csum_fold (sum3);
+              ip4_3->length = new_l3;
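+              /* Illustrative note: the rewrite template is generated with
+               * ip->length = 0, so old_l0..old_l3 above are always 0 and
+               * ip_csum_update() reduces to folding just the new total
+               * length into the template's precomputed checksum:
+               *
+               *   sum = ip4->checksum;              (valid for length 0)
+               *   sum = ip_csum_update (sum, 0, new_len,
+               *                         ip4_header_t, length);
+               *   ip4->checksum = ip_csum_fold (sum);
+               *
+               * i.e. no full 20-octet recompute per packet. */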
+              /* Fix UDP length and set source port */
+              udp0 = (udp_header_t *)(ip4_0+1);
+              new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain(vm, b0)
+                                             - sizeof (*ip4_0));
+              udp0->length = new_l0;
+              udp0->src_port = flow_hash0;
+              udp1 = (udp_header_t *)(ip4_1+1);
+              new_l1 = clib_host_to_net_u16 (vlib_buffer_length_in_chain(vm, b1)
+                                             - sizeof (*ip4_1));
+              udp1->length = new_l1;
+              udp1->src_port = flow_hash1;
+              udp2 = (udp_header_t *)(ip4_2+1);
+              new_l2 = clib_host_to_net_u16 (vlib_buffer_length_in_chain(vm, b2)
+                                             - sizeof (*ip4_2));
+              udp2->length = new_l2;
+              udp2->src_port = flow_hash2;
+              udp3 = (udp_header_t *)(ip4_3+1);
+              new_l3 = clib_host_to_net_u16 (vlib_buffer_length_in_chain(vm, b3)
+                                             - sizeof (*ip4_3));
+              udp3->length = new_l3;
+              udp3->src_port = flow_hash3;
+
+              /* Fix GTPU length */
+              gtpu0 = (gtpu_header_t *)(udp0+1);
+              new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain(vm, b0)
+                                             - sizeof (*ip4_0) - sizeof(*udp0));
+              gtpu0->length = new_l0;
+              gtpu1 = (gtpu_header_t *)(udp1+1);
+              new_l1 = clib_host_to_net_u16 (vlib_buffer_length_in_chain(vm, b1)
+                                             - sizeof (*ip4_1) - sizeof(*udp1));
+              gtpu1->length = new_l1;
+              gtpu2 = (gtpu_header_t *)(udp2+1);
+              new_l2 = clib_host_to_net_u16 (vlib_buffer_length_in_chain(vm, b2)
+                                             - sizeof (*ip4_2) - sizeof(*udp2));
+              gtpu2->length = new_l2;
+              gtpu3 = (gtpu_header_t *)(udp3+1);
+              new_l3 = clib_host_to_net_u16 (vlib_buffer_length_in_chain(vm, b3)
+                                             - sizeof (*ip4_3) - sizeof(*udp3));
+              gtpu3->length = new_l3;
+            }
+          else /* ipv6 */
+            {
+              int bogus = 0;
+
+              ip6_0 = vlib_buffer_get_current(b0);
+              ip6_1 = vlib_buffer_get_current(b1);
+              ip6_2 = vlib_buffer_get_current(b2);
+              ip6_3 = vlib_buffer_get_current(b3);
+
+              /* Copy the fixed header */
+              copy_dst0 = (u64 *) ip6_0;
+              copy_src0 = (u64 *) t0->rewrite;
+              copy_dst1 = (u64 *) ip6_1;
+              copy_src1 = (u64 *) t1->rewrite;
+              copy_dst2 = (u64 *) ip6_2;
+              copy_src2 = (u64 *) t2->rewrite;
+              copy_dst3 = (u64 *) ip6_3;
+              copy_src3 = (u64 *) t3->rewrite;
+              /* Copy first 56 (ip6) octets 8-bytes at a time */
+#define _(offs) copy_dst0[offs] = copy_src0[offs];
+              foreach_fixed_header6_offset;
+#undef _
+#define _(offs) copy_dst1[offs] = copy_src1[offs];
+              foreach_fixed_header6_offset;
+#undef _
+#define _(offs) copy_dst2[offs] = copy_src2[offs];
+              foreach_fixed_header6_offset;
+#undef _
+#define _(offs) copy_dst3[offs] = copy_src3[offs];
+              foreach_fixed_header6_offset;
+#undef _
+              /* Fix IP6 payload length */
+              new_l0 =
+                clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0)
+                                      - sizeof(*ip6_0));
+              ip6_0->payload_length = new_l0;
+              new_l1 =
+                clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b1)
+                                      - sizeof(*ip6_1));
+              ip6_1->payload_length = new_l1;
+              new_l2 =
+                clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b2)
+                                      - sizeof(*ip6_2));
+              ip6_2->payload_length = new_l2;
+              new_l3 =
+                clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b3)
+                                      - sizeof(*ip6_3));
+              ip6_3->payload_length = new_l3;
+
+              /* Fix UDP length and set source port */
+              udp0 = (udp_header_t *)(ip6_0+1);
+              udp0->length = new_l0;
+              udp0->src_port = flow_hash0;
+              udp1 = (udp_header_t *)(ip6_1+1);
+              udp1->length = new_l1;
+              udp1->src_port = flow_hash1;
+              udp2 = (udp_header_t *)(ip6_2+1);
+              udp2->length = new_l2;
+              udp2->src_port = flow_hash2;
+              udp3 = (udp_header_t *)(ip6_3+1);
+              udp3->length = new_l3;
+              udp3->src_port = flow_hash3;
+
+              /* IPv6 UDP checksum is mandatory */
+              udp0->checksum = ip6_tcp_udp_icmp_compute_checksum(vm, b0,
+                                                                 ip6_0, &bogus);
+              if (udp0->checksum == 0)
+                udp0->checksum = 0xffff;
+              udp1->checksum = ip6_tcp_udp_icmp_compute_checksum(vm, b1,
+                                                                 ip6_1, &bogus);
+              if (udp1->checksum == 0)
+                udp1->checksum = 0xffff;
+              udp2->checksum = ip6_tcp_udp_icmp_compute_checksum(vm, b2,
+                                                                 ip6_2, &bogus);
+              if (udp2->checksum == 0)
+                udp2->checksum = 0xffff;
+              udp3->checksum = ip6_tcp_udp_icmp_compute_checksum(vm, b3,
+                                                                 ip6_3, &bogus);
+              if (udp3->checksum == 0)
+                udp3->checksum = 0xffff;
+
+              /* Fix GTPU length */
+              gtpu0 = (gtpu_header_t *)(udp0+1);
+              new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain(vm, b0)
+                                             - sizeof (*ip6_0) - sizeof(*udp0));
+              gtpu0->length = new_l0;
+              gtpu1 = (gtpu_header_t *)(udp1+1);
+              new_l1 = clib_host_to_net_u16 (vlib_buffer_length_in_chain(vm, b1)
+                                             - sizeof (*ip6_1) - sizeof(*udp1));
+              gtpu1->length = new_l1;
+              gtpu2 = (gtpu_header_t *)(udp2+1);
+              new_l2 = clib_host_to_net_u16 (vlib_buffer_length_in_chain(vm, b2)
+                                             - sizeof (*ip6_2) - sizeof(*udp2));
+              gtpu2->length = new_l2;
+              gtpu3 = (gtpu_header_t *)(udp3+1);
+              new_l3 = clib_host_to_net_u16 (vlib_buffer_length_in_chain(vm, b3)
+                                             - sizeof (*ip6_3) - sizeof(*udp3));
+              gtpu3->length = new_l3;
+            }
+
+          pkts_encapsulated += 4;
+          len0 = vlib_buffer_length_in_chain (vm, b0);
+          len1 = vlib_buffer_length_in_chain (vm, b1);
+          len2 = vlib_buffer_length_in_chain (vm, b2);
+          len3 = vlib_buffer_length_in_chain (vm, b3);
+          stats_n_packets += 4;
+          stats_n_bytes += len0 + len1 + len2 + len3;
+
+          /* Batch stats increment on the same gtpu tunnel so counter is not
+             incremented per packet. Note stats are still incremented for deleted
+             and admin-down tunnel where packets are dropped. It is not worthwhile
+             to check for this rare case and affect normal path performance. */
+          if (PREDICT_FALSE ((sw_if_index0 != stats_sw_if_index) ||
+                             (sw_if_index1 != stats_sw_if_index) ||
+                             (sw_if_index2 != stats_sw_if_index) ||
+                             (sw_if_index3 != stats_sw_if_index) ))
+            {
+              stats_n_packets -= 4;
+              stats_n_bytes -= len0 + len1 + len2 + len3;
+              if ( (sw_if_index0 == sw_if_index1 ) &&
+                   (sw_if_index1 == sw_if_index2 ) &&
+                   (sw_if_index2 == sw_if_index3 ) )
+                {
+                  if (stats_n_packets)
+                    vlib_increment_combined_counter
+                      (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX,
+                       thread_index, stats_sw_if_index,
+                       stats_n_packets, stats_n_bytes);
+                  stats_sw_if_index = sw_if_index0;
+                  stats_n_packets = 4;
+                  stats_n_bytes = len0 + len1 + len2 + len3;
+                }
+              else
+                {
+                  vlib_increment_combined_counter
+                      (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX,
+                       thread_index, sw_if_index0, 1, len0);
+                  vlib_increment_combined_counter
+                      (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX,
+                       thread_index, sw_if_index1, 1, len1);
+                  vlib_increment_combined_counter
+                      (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX,
+                       thread_index, sw_if_index2, 1, len2);
+                  vlib_increment_combined_counter
+                      (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX,
+                       thread_index, sw_if_index3, 1, len3);
+                }
+            }
+
+          if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+            {
+              gtpu_encap_trace_t *tr =
+                vlib_add_trace (vm, node, b0, sizeof (*tr));
+              tr->tunnel_index = t0 - gtm->tunnels;
+              tr->teid = t0->teid;
+            }
+
+          if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
+            {
+              gtpu_encap_trace_t *tr =
+                vlib_add_trace (vm, node, b1, sizeof (*tr));
+              tr->tunnel_index = t1 - gtm->tunnels;
+              tr->teid = t1->teid;
+            }
+
+          if (PREDICT_FALSE(b2->flags & VLIB_BUFFER_IS_TRACED))
+            {
+              gtpu_encap_trace_t *tr =
+                vlib_add_trace (vm, node, b2, sizeof (*tr));
+              tr->tunnel_index = t2 - gtm->tunnels;
+              tr->teid = t2->teid;
+            }
+
+          if (PREDICT_FALSE(b3->flags & VLIB_BUFFER_IS_TRACED))
+            {
+              gtpu_encap_trace_t *tr =
+                vlib_add_trace (vm, node, b3, sizeof (*tr));
+              tr->tunnel_index = t3 - gtm->tunnels;
+              tr->teid = t3->teid;
+            }
+
+          vlib_validate_buffer_enqueue_x4 (vm, node, next_index,
+                                           to_next, n_left_to_next,
+                                           bi0, bi1, bi2, bi3,
+                                           next0, next1, next2, next3);
+        }
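+      /* Illustrative note: the quad loop above runs only while at least
+       * eight buffers remain, which keeps its from[4..7] prefetches in
+       * bounds; the scalar loop below drains the remaining buffers with
+       * the same per-packet logic, minus the prefetch. */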
+      while (n_left_from > 0 && n_left_to_next > 0)
+        {
+          u32 bi0;
+          vlib_buffer_t * b0;
+          u32 flow_hash0;
+          u32 len0;
+          ip4_header_t * ip4_0;
+          ip6_header_t * ip6_0;
+          udp_header_t * udp0;
+          gtpu_header_t * gtpu0;
+          u64 * copy_src0, * copy_dst0;
+          u32 * copy_src_last0, * copy_dst_last0;
+          u16 new_l0;
+          ip_csum_t sum0;
+
+          bi0 = from[0];
+          to_next[0] = bi0;
+          from += 1;
+          to_next += 1;
+          n_left_from -= 1;
+          n_left_to_next -= 1;
+
+          b0 = vlib_get_buffer (vm, bi0);
+
+          flow_hash0 = vnet_l2_compute_flow_hash(b0);
+
+          /* Get next node index and adj index from tunnel next_dpo */
+          sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_TX];
+          hi0 = vnet_get_sup_hw_interface (vnm, sw_if_index0);
+          t0 = &gtm->tunnels[hi0->dev_instance];
+          /* Note: change to always set next0 if it may be set to drop */
+          next0 = t0->next_dpo.dpoi_next_node;
+          vnet_buffer(b0)->ip.adj_index[VLIB_TX] = t0->next_dpo.dpoi_index;
+
+          /* Apply the rewrite string. $$$$ vnet_rewrite? */
+          vlib_buffer_advance (b0, -(word)_vec_len(t0->rewrite));
+
+          if (is_ip4)
+            {
+              ip4_0 = vlib_buffer_get_current(b0);
+
+              /* Copy the fixed header */
+              copy_dst0 = (u64 *) ip4_0;
+              copy_src0 = (u64 *) t0->rewrite;
+              /* Copy first 32 octets 8-bytes at a time */
+#define _(offs) copy_dst0[offs] = copy_src0[offs];
+              foreach_fixed_header4_offset;
+#undef _
+              /* Last 4 octets. Hopefully gcc will be our friend */
+              copy_dst_last0 = (u32 *)(&copy_dst0[4]);
+              copy_src_last0 = (u32 *)(&copy_src0[4]);
+              copy_dst_last0[0] = copy_src_last0[0];
+
+              /* Fix the IP4 checksum and length */
+              sum0 = ip4_0->checksum;
+              new_l0 = /* old_l0 always 0, see the rewrite setup */
+                clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0));
+              sum0 = ip_csum_update (sum0, old_l0, new_l0, ip4_header_t,
+                                     length /* changed member */);
+              ip4_0->checksum = ip_csum_fold (sum0);
+              ip4_0->length = new_l0;
+
+              /* Fix UDP length and set source port */
+              udp0 = (udp_header_t *)(ip4_0+1);
+              new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain(vm, b0)
+                                             - sizeof (*ip4_0));
+              udp0->length = new_l0;
+              udp0->src_port = flow_hash0;
+
+              /* Fix GTPU length */
+              gtpu0 = (gtpu_header_t *)(udp0+1);
+              new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain(vm, b0)
+                                             - sizeof (*ip4_0) - sizeof(*udp0));
+              gtpu0->length = new_l0;
+            }
+
+          else /* ip6 path */
+            {
+              int bogus = 0;
+
+              ip6_0 = vlib_buffer_get_current(b0);
+              /* Copy the fixed header */
+              copy_dst0 = (u64 *) ip6_0;
+              copy_src0 = (u64 *) t0->rewrite;
+              /* Copy first 56 (ip6) octets 8-bytes at a time */
+#define _(offs) copy_dst0[offs] = copy_src0[offs];
+              foreach_fixed_header6_offset;
+#undef _
+              /* Fix IP6 payload length */
+              new_l0 =
+                clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0)
+                                      - sizeof(*ip6_0));
+              ip6_0->payload_length = new_l0;
+
+              /* Fix UDP length and set source port */
+              udp0 = (udp_header_t *)(ip6_0+1);
+              udp0->length = new_l0;
+              udp0->src_port = flow_hash0;
+
+              /* IPv6 UDP checksum is mandatory */
+              udp0->checksum = ip6_tcp_udp_icmp_compute_checksum(vm, b0,
+                                                                 ip6_0, &bogus);
+              if (udp0->checksum == 0)
+                udp0->checksum = 0xffff;
+
+              /* Fix GTPU length */
+              gtpu0 = (gtpu_header_t *)(udp0+1);
+              new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain(vm, b0)
+                                             - sizeof (*ip6_0) - sizeof(*udp0));
+              gtpu0->length = new_l0;
+            }
+
+          pkts_encapsulated++;
+          len0 = vlib_buffer_length_in_chain (vm, b0);
+          stats_n_packets += 1;
+          stats_n_bytes += len0;
+
+          /* Batch stats increment on the same gtpu tunnel so counter is not
+             incremented per packet. Note stats are still incremented for deleted
+             and admin-down tunnel where packets are dropped. It is not worthwhile
+             to check for this rare case and affect normal path performance.
*/ + if (PREDICT_FALSE (sw_if_index0 != stats_sw_if_index)) + { + stats_n_packets -= 1; + stats_n_bytes -= len0; + if (stats_n_packets) + vlib_increment_combined_counter + (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, + thread_index, stats_sw_if_index, + stats_n_packets, stats_n_bytes); + stats_n_packets = 1; + stats_n_bytes = len0; + stats_sw_if_index = sw_if_index0; + } + + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + gtpu_encap_trace_t *tr = + vlib_add_trace (vm, node, b0, sizeof (*tr)); + tr->tunnel_index = t0 - gtm->tunnels; + tr->teid = t0->teid; + } + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + /* Do we still need this now that tunnel tx stats is kept? */ + vlib_node_increment_counter (vm, node->node_index, + GTPU_ENCAP_ERROR_ENCAPSULATED, + pkts_encapsulated); + + /* Increment any remaining batch stats */ + if (stats_n_packets) + { + vlib_increment_combined_counter + (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + node->runtime_data[0] = stats_sw_if_index; + } + + return from_frame->n_vectors; +} + +static uword +gtpu4_encap (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return gtpu_encap_inline (vm, node, from_frame, /* is_ip4 */ 1); +} + +static uword +gtpu6_encap (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return gtpu_encap_inline (vm, node, from_frame, /* is_ip4 */ 0); +} + +VLIB_REGISTER_NODE (gtpu4_encap_node) = { + .function = gtpu4_encap, + .name = "gtpu4-encap", + .vector_size = sizeof (u32), + .format_trace = format_gtpu_encap_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN(gtpu_encap_error_strings), + .error_strings = gtpu_encap_error_strings, + .n_next_nodes = GTPU_ENCAP_N_NEXT, + .next_nodes = { +#define _(s,n) [GTPU_ENCAP_NEXT_##s] = n, + foreach_gtpu_encap_next +#undef _ + }, +}; + +VLIB_NODE_FUNCTION_MULTIARCH (gtpu4_encap_node, gtpu4_encap) + +VLIB_REGISTER_NODE (gtpu6_encap_node) = { + .function = gtpu6_encap, + .name = "gtpu6-encap", + .vector_size = sizeof (u32), + .format_trace = format_gtpu_encap_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN(gtpu_encap_error_strings), + .error_strings = gtpu_encap_error_strings, + .n_next_nodes = GTPU_ENCAP_N_NEXT, + .next_nodes = { +#define _(s,n) [GTPU_ENCAP_NEXT_##s] = n, + foreach_gtpu_encap_next +#undef _ + }, +}; + +VLIB_NODE_FUNCTION_MULTIARCH (gtpu6_encap_node, gtpu6_encap) + diff --git a/src/plugins/gtpu/gtpu_error.def b/src/plugins/gtpu/gtpu_error.def new file mode 100644 index 00000000..093a886f --- /dev/null +++ b/src/plugins/gtpu/gtpu_error.def @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2017 Intel and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +gtpu_error (DECAPSULATED, "good packets decapsulated") +gtpu_error (NO_SUCH_TUNNEL, "no such tunnel packets") +gtpu_error (BAD_VER, "packets with bad version in gtpu header") +gtpu_error (BAD_FLAGS, "packets with bad flags field in gtpu header") diff --git a/src/plugins/gtpu/gtpu_msg_enum.h b/src/plugins/gtpu/gtpu_msg_enum.h new file mode 100644 index 00000000..358a220a --- /dev/null +++ b/src/plugins/gtpu/gtpu_msg_enum.h @@ -0,0 +1,31 @@ +/* + * gtpu_msg_enum.h - vpp engine plug-in message enumeration + * + * Copyright (c) 2017 Intel and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef included_gtpu_msg_enum_h +#define included_gtpu_msg_enum_h + +#include <vppinfra/byte_order.h> + +#define vl_msg_id(n,h) n, +typedef enum +{ +#include <gtpu/gtpu_all_api_h.h> + /* We'll want to know how many messages IDs we need... */ + VL_MSG_FIRST_AVAILABLE, +} vl_msg_id_t; +#undef vl_msg_id + +#endif /* included_gtpu_msg_enum_h */ diff --git a/src/plugins/gtpu/gtpu_test.c b/src/plugins/gtpu/gtpu_test.c new file mode 100644 index 00000000..e7fd0d54 --- /dev/null +++ b/src/plugins/gtpu/gtpu_test.c @@ -0,0 +1,498 @@ +/* + * Copyright (c) 2017 Intel and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <vat/vat.h> +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vlibsocket/api.h> +#include <vppinfra/error.h> +#include <gtpu/gtpu.h> + +#define __plugin_msg_base gtpu_test_main.msg_id_base +#include <vlibapi/vat_helper_macros.h> + + +uword unformat_ip46_address (unformat_input_t * input, va_list * args) +{ + ip46_address_t *ip46 = va_arg (*args, ip46_address_t *); + ip46_type_t type = va_arg (*args, ip46_type_t); + if ((type != IP46_TYPE_IP6) && + unformat(input, "%U", unformat_ip4_address, &ip46->ip4)) { + ip46_address_mask_ip4(ip46); + return 1; + } else if ((type != IP46_TYPE_IP4) && + unformat(input, "%U", unformat_ip6_address, &ip46->ip6)) { + return 1; + } + return 0; +} +uword unformat_ip46_prefix (unformat_input_t * input, va_list * args) +{ + ip46_address_t *ip46 = va_arg (*args, ip46_address_t *); + u8 *len = va_arg (*args, u8 *); + ip46_type_t type = va_arg (*args, ip46_type_t); + + u32 l; + if ((type != IP46_TYPE_IP6) && unformat(input, "%U/%u", unformat_ip4_address, &ip46->ip4, &l)) { + if (l > 32) + return 0; + *len = l + 96; + ip46->pad[0] = ip46->pad[1] = ip46->pad[2] = 0; + } else if ((type != IP46_TYPE_IP4) && unformat(input, "%U/%u", unformat_ip6_address, &ip46->ip6, &l)) { + if (l > 128) + return 0; + *len = l; + } else { + return 0; + } + return 1; +} +///////////////////////// + +#define vl_msg_id(n,h) n, +typedef enum { +#include <gtpu/gtpu.api.h> + /* We'll want to know how many messages IDs we need... */ + VL_MSG_FIRST_AVAILABLE, +} vl_msg_id_t; +#undef vl_msg_id + +/* define message structures */ +#define vl_typedefs +#include <gtpu/gtpu.api.h> +#undef vl_typedefs + +/* declare message handlers for each api */ + +#define vl_endianfun /* define message structures */ +#include <gtpu/gtpu.api.h> +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) +#define vl_printfun +#include <gtpu/gtpu.api.h> +#undef vl_printfun + +/* Get the API version number. 
*/ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <gtpu/gtpu.api.h> +#undef vl_api_version + +typedef struct { + /* API message ID base */ + u16 msg_id_base; + vat_main_t *vat_main; +} gtpu_test_main_t; + +gtpu_test_main_t gtpu_test_main; + +static void vl_api_gtpu_add_del_tunnel_reply_t_handler + (vl_api_gtpu_add_del_tunnel_reply_t * mp) +{ + vat_main_t *vam = &vat_main; + i32 retval = ntohl (mp->retval); + if (vam->async_mode) + { + vam->async_errors += (retval < 0); + } + else + { + vam->retval = retval; + vam->sw_if_index = ntohl (mp->sw_if_index); + vam->result_ready = 1; + } +} + + +#define foreach_standard_reply_retval_handler \ + _(sw_interface_set_gtpu_bypass_reply) + +#define _(n) \ + static void vl_api_##n##_t_handler \ + (vl_api_##n##_t * mp) \ + { \ + vat_main_t * vam = gtpu_test_main.vat_main; \ + i32 retval = ntohl(mp->retval); \ + if (vam->async_mode) { \ + vam->async_errors += (retval < 0); \ + } else { \ + vam->retval = retval; \ + vam->result_ready = 1; \ + } \ + } + foreach_standard_reply_retval_handler; +#undef _ + +/* + * Table of message reply handlers, must include boilerplate handlers + * we just generated + */ +#define foreach_vpe_api_reply_msg \ + _(SW_INTERFACE_SET_GTPU_BYPASS_REPLY, sw_interface_set_gtpu_bypass_reply) \ + _(GTPU_ADD_DEL_TUNNEL_REPLY, gtpu_add_del_tunnel_reply) \ + _(GTPU_TUNNEL_DETAILS, gtpu_tunnel_details) + + +static uword +api_unformat_sw_if_index (unformat_input_t * input, va_list * args) +{ + vat_main_t *vam = va_arg (*args, vat_main_t *); + u32 *result = va_arg (*args, u32 *); + u8 *if_name; + uword *p; + + if (!unformat (input, "%s", &if_name)) + return 0; + + p = hash_get_mem (vam->sw_if_index_by_interface_name, if_name); + if (p == 0) + return 0; + *result = p[0]; + return 1; +} + +static int +api_sw_interface_set_gtpu_bypass (vat_main_t * vam) +{ + unformat_input_t *i = vam->input; + vl_api_sw_interface_set_gtpu_bypass_t *mp; + u32 sw_if_index = 0; + u8 sw_if_index_set = 0; + u8 is_enable = 1; + u8 is_ipv6 = 0; + int ret; + + /* Parse args required to build the message */ + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) + { + if (unformat (i, "%U", api_unformat_sw_if_index, vam, &sw_if_index)) + sw_if_index_set = 1; + else if (unformat (i, "sw_if_index %d", &sw_if_index)) + sw_if_index_set = 1; + else if (unformat (i, "enable")) + is_enable = 1; + else if (unformat (i, "disable")) + is_enable = 0; + else if (unformat (i, "ip4")) + is_ipv6 = 0; + else if (unformat (i, "ip6")) + is_ipv6 = 1; + else + break; + } + + if (sw_if_index_set == 0) + { + errmsg ("missing interface name or sw_if_index"); + return -99; + } + + /* Construct the API message */ + M (SW_INTERFACE_SET_GTPU_BYPASS, mp); + + mp->sw_if_index = ntohl (sw_if_index); + mp->enable = is_enable; + mp->is_ipv6 = is_ipv6; + + /* send it... */ + S (mp); + + /* Wait for a reply... 
*/ + W (ret); + return ret; +} + +static uword unformat_gtpu_decap_next + (unformat_input_t * input, va_list * args) +{ + u32 *result = va_arg (*args, u32 *); + u32 tmp; + + if (unformat (input, "l2")) + *result = GTPU_INPUT_NEXT_L2_INPUT; + else if (unformat (input, "%d", &tmp)) + *result = tmp; + else + return 0; + return 1; +} + +static int +api_gtpu_add_del_tunnel (vat_main_t * vam) +{ + unformat_input_t *line_input = vam->input; + vl_api_gtpu_add_del_tunnel_t *mp; + ip46_address_t src, dst; + u8 is_add = 1; + u8 ipv4_set = 0, ipv6_set = 0; + u8 src_set = 0; + u8 dst_set = 0; + u8 grp_set = 0; + u32 mcast_sw_if_index = ~0; + u32 encap_vrf_id = 0; + u32 decap_next_index = ~0; + u32 teid = 0; + int ret; + + /* Can't "universally zero init" (={0}) due to GCC bug 53119 */ + memset (&src, 0, sizeof src); + memset (&dst, 0, sizeof dst); + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "del")) + is_add = 0; + else + if (unformat (line_input, "src %U", unformat_ip4_address, &src.ip4)) + { + ipv4_set = 1; + src_set = 1; + } + else + if (unformat (line_input, "dst %U", unformat_ip4_address, &dst.ip4)) + { + ipv4_set = 1; + dst_set = 1; + } + else + if (unformat (line_input, "src %U", unformat_ip6_address, &src.ip6)) + { + ipv6_set = 1; + src_set = 1; + } + else + if (unformat (line_input, "dst %U", unformat_ip6_address, &dst.ip6)) + { + ipv6_set = 1; + dst_set = 1; + } + else if (unformat (line_input, "group %U %U", + unformat_ip4_address, &dst.ip4, + api_unformat_sw_if_index, vam, &mcast_sw_if_index)) + { + grp_set = dst_set = 1; + ipv4_set = 1; + } + else if (unformat (line_input, "group %U", + unformat_ip4_address, &dst.ip4)) + { + grp_set = dst_set = 1; + ipv4_set = 1; + } + else if (unformat (line_input, "group %U %U", + unformat_ip6_address, &dst.ip6, + api_unformat_sw_if_index, vam, &mcast_sw_if_index)) + { + grp_set = dst_set = 1; + ipv6_set = 1; + } + else if (unformat (line_input, "group %U", + unformat_ip6_address, &dst.ip6)) + { + grp_set = dst_set = 1; + ipv6_set = 1; + } + else + if (unformat (line_input, "mcast_sw_if_index %u", &mcast_sw_if_index)) + ; + else if (unformat (line_input, "encap-vrf-id %d", &encap_vrf_id)) + ; + else if (unformat (line_input, "decap-next %U", + unformat_gtpu_decap_next, &decap_next_index)) + ; + else if (unformat (line_input, "teid %d", &teid)) + ; + else + { + errmsg ("parse error '%U'", format_unformat_error, line_input); + return -99; + } + } + + if (src_set == 0) + { + errmsg ("tunnel src address not specified"); + return -99; + } + if (dst_set == 0) + { + errmsg ("tunnel dst address not specified"); + return -99; + } + + if (grp_set && !ip46_address_is_multicast (&dst)) + { + errmsg ("tunnel group address not multicast"); + return -99; + } + if (grp_set && mcast_sw_if_index == ~0) + { + errmsg ("tunnel nonexistent multicast device"); + return -99; + } + if (grp_set == 0 && ip46_address_is_multicast (&dst)) + { + errmsg ("tunnel dst address must be unicast"); + return -99; + } + + + if (ipv4_set && ipv6_set) + { + errmsg ("both IPv4 and IPv6 addresses specified"); + return -99; + } + + M (GTPU_ADD_DEL_TUNNEL, mp); + + if (ipv6_set) + { + clib_memcpy (mp->src_address, &src.ip6, sizeof (src.ip6)); + clib_memcpy (mp->dst_address, &dst.ip6, sizeof (dst.ip6)); + } + else + { + clib_memcpy (mp->src_address, &src.ip4, sizeof (src.ip4)); + clib_memcpy (mp->dst_address, &dst.ip4, sizeof (dst.ip4)); + } + mp->encap_vrf_id = ntohl (encap_vrf_id); + mp->decap_next_index = ntohl (decap_next_index); + 
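+  /* All multi-byte API fields must be in network byte order on the wire;
+   * ntohl and htonl perform the same byte swap, so either spelling works
+   * on these u32 fields. */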
+  mp->mcast_sw_if_index = ntohl (mcast_sw_if_index);
+  mp->teid = ntohl (teid);
+  mp->is_add = is_add;
+  mp->is_ipv6 = ipv6_set;
+
+  S (mp);
+  W (ret);
+  return ret;
+}
+
+static void vl_api_gtpu_tunnel_details_t_handler
+  (vl_api_gtpu_tunnel_details_t * mp)
+{
+  vat_main_t *vam = &vat_main;
+  ip46_address_t src = to_ip46 (mp->is_ipv6, mp->src_address);
+  ip46_address_t dst = to_ip46 (mp->is_ipv6, mp->dst_address);
+
+  print (vam->ofp, "%11d%24U%24U%14d%18d%13d%19d",
+         ntohl (mp->sw_if_index),
+         format_ip46_address, &src, IP46_TYPE_ANY,
+         format_ip46_address, &dst, IP46_TYPE_ANY,
+         ntohl (mp->encap_vrf_id),
+         ntohl (mp->decap_next_index), ntohl (mp->teid),
+         ntohl (mp->mcast_sw_if_index));
+}
+
+static int
+api_gtpu_tunnel_dump (vat_main_t * vam)
+{
+  unformat_input_t *i = vam->input;
+  vl_api_gtpu_tunnel_dump_t *mp;
+  u32 sw_if_index;
+  u8 sw_if_index_set = 0;
+  int ret;
+
+  /* Parse args required to build the message */
+  while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (i, "sw_if_index %d", &sw_if_index))
+        sw_if_index_set = 1;
+      else
+        break;
+    }
+
+  if (sw_if_index_set == 0)
+    {
+      sw_if_index = ~0;
+    }
+
+  if (!vam->json_output)
+    {
+      print (vam->ofp, "%11s%24s%24s%14s%18s%13s%19s",
+             "sw_if_index", "src_address", "dst_address",
+             "encap_vrf_id", "decap_next_index", "teid", "mcast_sw_if_index");
+    }
+
+  /* Get list of gtpu-tunnel interfaces */
+  M (GTPU_TUNNEL_DUMP, mp);
+
+  mp->sw_if_index = htonl (sw_if_index);
+
+  S (mp);
+
+  W (ret);
+  return ret;
+}
+
+/*
+ * List of messages that the api test plugin sends,
+ * and that the data plane plugin processes
+ */
+#define foreach_vpe_api_msg                                             \
+_(sw_interface_set_gtpu_bypass,                                         \
+  "<intfc> | sw_if_index <id> [ip4 | ip6] [enable | disable]")          \
+_(gtpu_add_del_tunnel,                                                  \
+  "src <ip-addr> { dst <ip-addr> | group <mcast-ip-addr>\n"             \
+  "{ <intfc> | mcast_sw_if_index <nn> } }\n"                            \
+  "teid <teid> [encap-vrf-id <nn>] [decap-next <l2|nn>] [del]")         \
+_(gtpu_tunnel_dump, "[<intfc> | sw_if_index <nn>]")                     \
+
+static void
+gtpu_vat_api_hookup (vat_main_t *vam)
+{
+  gtpu_test_main_t * gtm = &gtpu_test_main;
+  /* Hook up handlers for replies from the data plane plug-in */
+#define _(N,n)                                                  \
+  vl_msg_api_set_handlers((VL_API_##N + gtm->msg_id_base),      \
+                          #n,                                   \
+                          vl_api_##n##_t_handler,               \
+                          vl_noop_handler,                      \
+                          vl_api_##n##_t_endian,                \
+                          vl_api_##n##_t_print,                 \
+                          sizeof(vl_api_##n##_t), 1);
+  foreach_vpe_api_reply_msg;
+#undef _
+
+  /* API messages we can send */
+#define _(n,h) hash_set_mem (vam->function_by_name, #n, api_##n);
+  foreach_vpe_api_msg;
+#undef _
+
+  /* Help strings */
+#define _(n,h) hash_set_mem (vam->help_by_name, #n, h);
+  foreach_vpe_api_msg;
+#undef _
+}
+
+clib_error_t * vat_plugin_register (vat_main_t *vam)
+{
+  gtpu_test_main_t * gtm = &gtpu_test_main;
+
+  u8 * name;
+
+  gtm->vat_main = vam;
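+  /* Plugin API message IDs are assigned at run time: the engine hands
+   * each plugin a contiguous block of IDs, looked up by the name built
+   * below from the plugin name plus the API version generated from
+   * gtpu.api, so a client and an engine built from different API
+   * definitions simply fail the lookup. */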
+  /* Ask the vpp engine for the first assigned message-id */
+  name = format (0, "gtpu_%08x%c", api_version, 0);
+  gtm->msg_id_base = vl_client_get_first_plugin_msg_id ((char *) name);
+
+  if (gtm->msg_id_base != (u16) ~0)
+    gtpu_vat_api_hookup (vam);
+
+  vec_free(name);
+
+  return 0;
+}
diff --git a/src/plugins/ila.am b/src/plugins/ila.am
new file mode 100644
index 00000000..d900f3eb
--- /dev/null
+++ b/src/plugins/ila.am
@@ -0,0 +1,20 @@
+# Copyright (c) 2016 Cisco Systems, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+vppplugins_LTLIBRARIES += ila_plugin.la
+
+ila_plugin_la_SOURCES = ila/ila.c
+
+noinst_HEADERS += ila/ila.h
+
+# vi:syntax=automake
diff --git a/src/plugins/ila/ila.c b/src/plugins/ila/ila.c
new file mode 100644
index 00000000..fd56043e
--- /dev/null
+++ b/src/plugins/ila/ila.c
@@ -0,0 +1,1079 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ila/ila.h>
+#include <vnet/plugin/plugin.h>
+#include <vnet/ip/lookup.h>
+#include <vnet/dpo/dpo.h>
+#include <vnet/fib/fib_table.h>
+#include <vpp/app/version.h>
+
+static ila_main_t ila_main;
+
+#define ILA_TABLE_DEFAULT_HASH_NUM_BUCKETS (64 * 1024)
+#define ILA_TABLE_DEFAULT_HASH_MEMORY_SIZE (32<<20)
+
+#define foreach_ila_error \
+ _(NONE, "valid ILA packets")
+
+typedef enum {
+#define _(sym,str) ILA_ERROR_##sym,
+  foreach_ila_error
+#undef _
+  ILA_N_ERROR,
+} ila_error_t;
+
+static char *ila_error_strings[] = {
+#define _(sym,string) string,
+  foreach_ila_error
+#undef _
+};
+
+typedef enum {
+  ILA_ILA2SIR_NEXT_DROP,
+  ILA_ILA2SIR_N_NEXT,
+} ila_ila2sir_next_t;
+
+typedef struct {
+  u32 ila_index;
+  ip6_address_t initial_dst;
+  u32 adj_index;
+} ila_ila2sir_trace_t;
+
+static ila_entry_t ila_sir2ila_default_entry = {
+  .csum_mode = ILA_CSUM_MODE_NO_ACTION,
+  .type = ILA_TYPE_IID,
+  .dir = ILA_DIR_ILA2SIR, //Will pass the packet with no change
+};
+
+/**
+ * @brief Dynamically registered DPO Type for ILA
+ */
+static dpo_type_t ila_dpo_type;
+
+/**
+ * @brief Dynamically registered FIB node type for ILA
+ */
+static fib_node_type_t ila_fib_node_type;
+
+u8 *
+format_half_ip6_address (u8 * s, va_list * va)
+{
+  u64 v = clib_net_to_host_u64 (va_arg (*va, u64));
+
+  return format (s, "%04x:%04x:%04x:%04x",
+                 v >> 48, (v >> 32) & 0xffff, (v >> 16) & 0xffff, v & 0xffff);
+}
+
+u8 *
+format_ila_direction (u8 * s, va_list * args)
+{
+  ila_direction_t t = va_arg (*args, ila_direction_t);
+#define _(i,n,st) \
+  if (t == ILA_DIR_##i) \
+    return format(s, st);
+  ila_foreach_direction
+#undef _
+  return format (s, "invalid_ila_direction");
+}
+
+static u8 *
+format_csum_mode (u8 * s, va_list * va)
+{
+  ila_csum_mode_t csum_mode = va_arg (*va, ila_csum_mode_t);
+  char *txt;
+
+  switch (csum_mode)
+    {
+#define _(i,n,st) \
+    case ILA_CSUM_MODE_##i: \
+      txt = st; \
+      break;
+      ila_csum_foreach_type
+#undef _
+    default:
+      txt = "invalid_ila_csum_mode";
+      break;
+    }
+  return format (s, txt);
+}
+
+u8 *
+format_ila_type (u8 * s, va_list * args)
+{
+  ila_type_t t = va_arg (*args, ila_type_t);
+#define _(i,n,st) \
+  if (t == ILA_TYPE_##i) \
+    return format(s, st);
+  ila_foreach_type
+#undef _ + return format (s, "invalid_ila_type"); +} + +static u8 * +format_ila_entry (u8 * s, va_list * va) +{ + vnet_main_t *vnm = va_arg (*va, vnet_main_t *); + ila_entry_t *e = va_arg (*va, ila_entry_t *); + + if (!e) + { + return format (s, "%-15s%=40s%=40s%+16s%+18s%+11s", "Type", "SIR Address", + "ILA Address", "Checksum Mode", "Direction", "Next DPO"); + } + else if (vnm) + { + if (ip6_address_is_zero(&e->next_hop)) + { + return format (s, "%-15U%=40U%=40U%18U%11U%s", + format_ila_type, e->type, + format_ip6_address, &e->sir_address, + format_ip6_address, &e->ila_address, + format_csum_mode, e->csum_mode, + format_ila_direction, e->dir, + "n/a"); + } + else + { + return format (s, "%-15U%=40U%=40U%18U%11U%U", + format_ila_type, e->type, + format_ip6_address, &e->sir_address, + format_ip6_address, &e->ila_address, + format_csum_mode, e->csum_mode, + format_ila_direction, e->dir, + format_dpo_id, &e->ila_dpo, 0); + } + } + + return NULL; +} + +u8 * +format_ila_ila2sir_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + ila_ila2sir_trace_t *t = va_arg (*args, ila_ila2sir_trace_t *); + return format (s, + "ILA -> SIR adj index: %d entry index: %d initial_dst: %U", + t->adj_index, t->ila_index, format_ip6_address, + &t->initial_dst); +} + +static uword +unformat_ila_direction (unformat_input_t * input, va_list * args) +{ + ila_direction_t *result = va_arg (*args, ila_direction_t *); +#define _(i,n,s) \ + if (unformat(input, s)) \ + { \ + *result = ILA_DIR_##i; \ + return 1;\ + } + + ila_foreach_direction +#undef _ + return 0; +} + +static uword +unformat_ila_type (unformat_input_t * input, va_list * args) +{ + ila_type_t *result = va_arg (*args, ila_type_t *); +#define _(i,n,s) \ + if (unformat(input, s)) \ + { \ + *result = ILA_TYPE_##i; \ + return 1;\ + } + + ila_foreach_type +#undef _ + return 0; +} + +static uword +unformat_ila_csum_mode (unformat_input_t * input, va_list * args) +{ + ila_csum_mode_t *result = va_arg (*args, ila_csum_mode_t *); + if (unformat (input, "none") || unformat (input, "no-action")) + { + *result = ILA_CSUM_MODE_NO_ACTION; + return 1; + } + if (unformat (input, "neutral-map")) + { + *result = ILA_CSUM_MODE_NEUTRAL_MAP; + return 1; + } + if (unformat (input, "adjust-transport")) + { + *result = ILA_CSUM_MODE_ADJUST_TRANSPORT; + return 1; + } + return 0; +} + +static uword +unformat_half_ip6_address (unformat_input_t * input, va_list * args) +{ + u64 *result = va_arg (*args, u64 *); + u32 a[4]; + + if (!unformat (input, "%x:%x:%x:%x", &a[0], &a[1], &a[2], &a[3])) + return 0; + + if (a[0] > 0xFFFF || a[1] > 0xFFFF || a[2] > 0xFFFF || a[3] > 0xFFFF) + return 0; + + *result = clib_host_to_net_u64 ((((u64) a[0]) << 48) | + (((u64) a[1]) << 32) | + (((u64) a[2]) << 16) | (((u64) a[3]))); + + return 1; +} + +static vlib_node_registration_t ila_ila2sir_node; + +static uword +ila_ila2sir (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + u32 n_left_from, *from, next_index, *to_next, n_left_to_next; + ila_main_t *ilm = &ila_main; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next >= 2) + { + u32 pi0, pi1; + vlib_buffer_t *p0, *p1; + ila_entry_t *ie0, *ie1; + ip6_header_t *ip60, *ip61; + ip6_address_t *sir_address0, 
*sir_address1; + + { + vlib_buffer_t *p2, *p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + CLIB_PREFETCH (p2->data, sizeof (ip6_header_t), LOAD); + CLIB_PREFETCH (p3->data, sizeof (ip6_header_t), LOAD); + } + + pi0 = to_next[0] = from[0]; + pi1 = to_next[1] = from[1]; + from += 2; + n_left_from -= 2; + to_next += 2; + n_left_to_next -= 2; + + p0 = vlib_get_buffer (vm, pi0); + p1 = vlib_get_buffer (vm, pi1); + ip60 = vlib_buffer_get_current (p0); + ip61 = vlib_buffer_get_current (p1); + sir_address0 = &ip60->dst_address; + sir_address1 = &ip61->dst_address; + ie0 = pool_elt_at_index (ilm->entries, + vnet_buffer (p0)->ip.adj_index[VLIB_TX]); + ie1 = pool_elt_at_index (ilm->entries, + vnet_buffer (p1)->ip.adj_index[VLIB_TX]); + + if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED)) + { + ila_ila2sir_trace_t *tr = + vlib_add_trace (vm, node, p0, sizeof (*tr)); + tr->ila_index = ie0 - ilm->entries; + tr->initial_dst = ip60->dst_address; + tr->adj_index = vnet_buffer (p0)->ip.adj_index[VLIB_TX]; + } + + if (PREDICT_FALSE (p1->flags & VLIB_BUFFER_IS_TRACED)) + { + ila_ila2sir_trace_t *tr = + vlib_add_trace (vm, node, p1, sizeof (*tr)); + tr->ila_index = ie1 - ilm->entries; + tr->initial_dst = ip61->dst_address; + tr->adj_index = vnet_buffer (p1)->ip.adj_index[VLIB_TX]; + } + + sir_address0 = (ie0->dir != ILA_DIR_SIR2ILA) ? &ie0->sir_address : sir_address0; + sir_address1 = (ie1->dir != ILA_DIR_SIR2ILA) ? &ie1->sir_address : sir_address1; + ip60->dst_address.as_u64[0] = sir_address0->as_u64[0]; + ip60->dst_address.as_u64[1] = sir_address0->as_u64[1]; + ip61->dst_address.as_u64[0] = sir_address1->as_u64[0]; + ip61->dst_address.as_u64[1] = sir_address1->as_u64[1]; + + vnet_buffer (p0)->ip.adj_index[VLIB_TX] = ie0->ila_dpo.dpoi_index; + vnet_buffer (p1)->ip.adj_index[VLIB_TX] = ie1->ila_dpo.dpoi_index; + + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, to_next, + n_left_to_next, pi0, pi1, + ie0->ila_dpo.dpoi_next_node, + ie1->ila_dpo.dpoi_next_node); + } + + /* Single loop */ + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 pi0; + vlib_buffer_t *p0; + ila_entry_t *ie0; + ip6_header_t *ip60; + ip6_address_t *sir_address0; + + pi0 = to_next[0] = from[0]; + from += 1; + n_left_from -= 1; + to_next += 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, pi0); + ip60 = vlib_buffer_get_current (p0); + sir_address0 = &ip60->dst_address; + ie0 = pool_elt_at_index (ilm->entries, + vnet_buffer (p0)->ip.adj_index[VLIB_TX]); + + if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED)) + { + ila_ila2sir_trace_t *tr = + vlib_add_trace (vm, node, p0, sizeof (*tr)); + tr->ila_index = ie0 ? (ie0 - ilm->entries) : ~0; + tr->initial_dst = ip60->dst_address; + tr->adj_index = vnet_buffer (p0)->ip.adj_index[VLIB_TX]; + } + + sir_address0 = (ie0->dir != ILA_DIR_SIR2ILA) ? 
&ie0->sir_address : sir_address0; + ip60->dst_address.as_u64[0] = sir_address0->as_u64[0]; + ip60->dst_address.as_u64[1] = sir_address0->as_u64[1]; + vnet_buffer (p0)->ip.adj_index[VLIB_TX] = ie0->ila_dpo.dpoi_index; + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, pi0, + ie0->ila_dpo.dpoi_next_node); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return frame->n_vectors; +} + +/** *INDENT-OFF* */ +VLIB_REGISTER_NODE (ila_ila2sir_node, static) = +{ + .function = ila_ila2sir, + .name = "ila-to-sir", + .vector_size = sizeof (u32), + .format_trace = format_ila_ila2sir_trace, + .n_errors = ILA_N_ERROR, + .error_strings = ila_error_strings, + .n_next_nodes = ILA_ILA2SIR_N_NEXT, + .next_nodes = + { + [ILA_ILA2SIR_NEXT_DROP] = "error-drop" + }, +}; +/** *INDENT-ON* */ + +typedef enum +{ + ILA_SIR2ILA_NEXT_DROP, + ILA_SIR2ILA_N_NEXT, +} ila_sir2ila_next_t; + +typedef struct +{ + u32 ila_index; + ip6_address_t initial_dst; +} ila_sir2ila_trace_t; + +u8 * +format_ila_sir2ila_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + ila_sir2ila_trace_t *t = va_arg (*args, ila_sir2ila_trace_t *); + + return format (s, "SIR -> ILA entry index: %d initial_dst: %U", + t->ila_index, format_ip6_address, &t->initial_dst); +} + +static vlib_node_registration_t ila_sir2ila_node; + +static uword +ila_sir2ila (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + u32 n_left_from, *from, next_index, *to_next, n_left_to_next; + ila_main_t *ilm = &ila_main; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next >= 2) + { + u32 pi0, pi1; + vlib_buffer_t *p0, *p1; + ip6_header_t *ip60, *ip61; + u32 next0 = ILA_SIR2ILA_NEXT_DROP; + u32 next1 = ILA_SIR2ILA_NEXT_DROP; + BVT (clib_bihash_kv) kv0, value0; + BVT (clib_bihash_kv) kv1, value1; + ila_entry_t *ie0 = &ila_sir2ila_default_entry; + ila_entry_t *ie1 = &ila_sir2ila_default_entry; + ip6_address_t *ila_address0, *ila_address1; + + { + vlib_buffer_t *p2, *p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + CLIB_PREFETCH (p2->data, sizeof (ip6_header_t), LOAD); + CLIB_PREFETCH (p3->data, sizeof (ip6_header_t), LOAD); + } + + pi0 = to_next[0] = from[0]; + pi1 = to_next[1] = from[1]; + from += 2; + n_left_from -= 2; + to_next += 2; + n_left_to_next -= 2; + + p0 = vlib_get_buffer (vm, pi0); + p1 = vlib_get_buffer (vm, pi1); + ip60 = vlib_buffer_get_current (p0); + ip61 = vlib_buffer_get_current (p1); + ila_address0 = &ip60->dst_address; + ila_address1 = &ip61->dst_address; + kv0.key[0] = ip60->dst_address.as_u64[0]; + kv0.key[1] = ip60->dst_address.as_u64[1]; + kv0.key[2] = 0; + kv1.key[0] = ip61->dst_address.as_u64[0]; + kv1.key[1] = ip61->dst_address.as_u64[1]; + kv1.key[2] = 0; + + if (PREDICT_TRUE((BV (clib_bihash_search) + (&ilm->id_to_entry_table, &kv0, &value0)) == 0)) { + ie0 = &ilm->entries[value0.value]; + ila_address0 = (ie0->dir != ILA_DIR_ILA2SIR) ? 
&ie0->ila_address : ila_address0; + } + + if ((BV (clib_bihash_search) + (&ilm->id_to_entry_table, &kv1, &value1)) == 0) { + ie1 = &ilm->entries[value1.value]; + ila_address1 = (ie1->dir != ILA_DIR_ILA2SIR) ? &ie1->ila_address : ila_address1; + } + + if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED)) + { + ila_sir2ila_trace_t *tr = + vlib_add_trace (vm, node, p0, sizeof (*tr)); + tr->ila_index = + (ie0 != &ila_sir2ila_default_entry) ? (ie0 - ilm->entries) : ~0; + tr->initial_dst = ip60->dst_address; + } + + if (PREDICT_FALSE (p1->flags & VLIB_BUFFER_IS_TRACED)) + { + ila_sir2ila_trace_t *tr = + vlib_add_trace (vm, node, p1, sizeof (*tr)); + tr->ila_index = + (ie1 != &ila_sir2ila_default_entry) ? (ie1 - ilm->entries) : ~0; + tr->initial_dst = ip61->dst_address; + } + + ip60->dst_address.as_u64[0] = ila_address0->as_u64[0]; + ip60->dst_address.as_u64[1] = ila_address0->as_u64[1]; + ip61->dst_address.as_u64[0] = ila_address1->as_u64[0]; + ip61->dst_address.as_u64[1] = ila_address1->as_u64[1]; + + vnet_feature_next (vnet_buffer (p0)->sw_if_index[VLIB_RX], &next0, p0); + vnet_feature_next (vnet_buffer (p1)->sw_if_index[VLIB_RX], &next1, p1); + + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, to_next, + n_left_to_next, pi0, pi1, next0, + next1); + } + + /* Single loop */ + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 pi0; + vlib_buffer_t *p0; + ip6_header_t *ip60; + u32 next0 = ILA_SIR2ILA_NEXT_DROP; + BVT (clib_bihash_kv) kv0, value0; + ila_entry_t *ie0 = &ila_sir2ila_default_entry; + ip6_address_t *ila_address0; + + pi0 = to_next[0] = from[0]; + from += 1; + n_left_from -= 1; + to_next += 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, pi0); + ip60 = vlib_buffer_get_current (p0); + ila_address0 = &ip60->dst_address; + + kv0.key[0] = ip60->dst_address.as_u64[0]; + kv0.key[1] = ip60->dst_address.as_u64[1]; + kv0.key[2] = 0; + + if (PREDICT_TRUE((BV (clib_bihash_search) + (&ilm->id_to_entry_table, &kv0, &value0)) == 0)) { + ie0 = &ilm->entries[value0.value]; + ila_address0 = (ie0->dir != ILA_DIR_ILA2SIR) ? &ie0->ila_address : ila_address0; + } + + if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED)) + { + ila_sir2ila_trace_t *tr = + vlib_add_trace (vm, node, p0, sizeof (*tr)); + tr->ila_index = + (ie0 != &ila_sir2ila_default_entry) ? 
(ie0 - ilm->entries) : ~0; + tr->initial_dst = ip60->dst_address; + } + + //This operation should do everything for any type (except vnid4 obviously) + ip60->dst_address.as_u64[0] = ila_address0->as_u64[0]; + ip60->dst_address.as_u64[1] = ila_address0->as_u64[1]; + + vnet_feature_next (vnet_buffer (p0)->sw_if_index[VLIB_RX], &next0, p0); + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, pi0, next0); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return frame->n_vectors; +} + +/** *INDENT-OFF* */ +VLIB_REGISTER_NODE (ila_sir2ila_node, static) = +{ + .function = ila_sir2ila,.name = "sir-to-ila", + .vector_size = sizeof (u32), + .format_trace = format_ila_sir2ila_trace, + .n_errors = ILA_N_ERROR, + .error_strings = ila_error_strings, + .n_next_nodes = ILA_SIR2ILA_N_NEXT, + .next_nodes = + { + [ILA_SIR2ILA_NEXT_DROP] = "error-drop" + }, +}; +/** *INDENT-ON* */ + +/** *INDENT-OFF* */ +VNET_FEATURE_INIT (ila_sir2ila, static) = +{ + .arc_name = "ip6-unicast", + .node_name = "sir-to-ila", + .runs_before = VNET_FEATURES ("ip6-lookup"), +}; +/** *INDENT-ON* */ + +static void +ila_entry_stack (ila_entry_t *ie) +{ + /* + * restack on the next-hop's FIB entry + */ + dpo_stack(ila_dpo_type, + DPO_PROTO_IP6, + &ie->ila_dpo, + fib_entry_contribute_ip_forwarding( + ie->next_hop_fib_entry_index)); +} + +int +ila_add_del_entry (ila_add_del_entry_args_t * args) +{ + ila_main_t *ilm = &ila_main; + BVT (clib_bihash_kv) kv, value; + + //Sanity check + if (args->type == ILA_TYPE_IID || args->type == ILA_TYPE_LUID) + { + if ((args->sir_address.as_u8[8] >> 5) != args->type) + { + clib_warning ("Incorrect SIR address (ILA type mismatch %d %d)", + args->sir_address.as_u8[8] >> 1, args->type); + return -1; + } + if (args->sir_address.as_u8[8] & 0x10) + { + clib_warning ("Checksum bit should not be set in SIR address"); + return -1; + } + } + else if (args->type == ILA_TYPE_VNIDM) + { + if (args->sir_address.as_u8[0] != 0xff || + (args->sir_address.as_u8[1] & 0xf0) != 0xf0) + { + clib_warning ("SIR multicast address must start with fff"); + return -1; + } + if (args->sir_address.as_u16[1] || args->sir_address.as_u16[2] || + args->sir_address.as_u16[3] || args->sir_address.as_u16[4] || + args->sir_address.as_u16[5] || (args->sir_address.as_u8[12] & 0xf0)) + { + clib_warning ("SIR multicast address must start with fff"); + return -1; + } + } + + if (!args->is_del) + { + ila_entry_t *e; + pool_get (ilm->entries, e); + e->type = args->type; + e->sir_address = args->sir_address; + e->next_hop = args->next_hop_address; + e->csum_mode = args->csum_mode; + e->dir = args->dir; + + //Construct ILA address + switch (e->type) + { + case ILA_TYPE_IID: + e->ila_address = e->sir_address; + break; + case ILA_TYPE_LUID: + e->ila_address.as_u64[0] = args->locator; + e->ila_address.as_u64[1] = args->sir_address.as_u64[1]; + break; + case ILA_TYPE_VNID6: + e->ila_address.as_u64[0] = args->locator; + e->ila_address.as_u8[8] = (ILA_TYPE_VNID6 << 1); + e->ila_address.as_u32[2] |= args->vnid; + e->ila_address.as_u32[3] = args->sir_address.as_u32[3]; + break; + case ILA_TYPE_VNIDM: + e->ila_address.as_u64[0] = args->locator; + e->ila_address.as_u8[8] = (ILA_TYPE_VNIDM << 1); + e->ila_address.as_u32[2] |= args->vnid; + e->ila_address.as_u32[3] = args->sir_address.as_u32[3]; + e->ila_address.as_u8[12] |= args->sir_address.as_u8[2] << 4; + break; + case ILA_TYPE_VNID4: + clib_warning ("ILA type '%U' is not supported", format_ila_type, + e->type); + return -1; + } + + //Modify ILA 
checksum if necessary
+      if (e->csum_mode == ILA_CSUM_MODE_NEUTRAL_MAP)
+	{
+	  ip_csum_t csum = e->ila_address.as_u16[7];
+	  int i;
+	  for (i = 0; i < 4; i++)
+	    {
+	      csum = ip_csum_sub_even (csum, e->sir_address.as_u32[i]);
+	      csum = ip_csum_add_even (csum, e->ila_address.as_u32[i]);
+	    }
+	  csum = ip_csum_add_even (csum, clib_host_to_net_u16 (0x1000));
+	  e->ila_address.as_u16[7] = ip_csum_fold (csum);
+	  e->ila_address.as_u8[8] |= 0x10;
+	}
+
+      //Create entry with the sir address
+      kv.key[0] = e->sir_address.as_u64[0];
+      kv.key[1] = e->sir_address.as_u64[1];
+      kv.key[2] = 0;
+      kv.value = e - ilm->entries;
+      BV (clib_bihash_add_del) (&ilm->id_to_entry_table, &kv,
+				1 /* is_add */ );
+
+      if (!ip6_address_is_zero(&e->next_hop))
+	{
+	  /*
+	   * become a child of the FIB entry for the next-hop
+	   * so we are informed when its forwarding changes
+	   */
+	  fib_prefix_t next_hop = {
+	    .fp_addr = {
+	      .ip6 = e->next_hop,
+	    },
+	    .fp_len = 128,
+	    .fp_proto = FIB_PROTOCOL_IP6,
+	  };
+
+	  e->next_hop_fib_entry_index =
+	    fib_table_entry_special_add(0,
+					&next_hop,
+					FIB_SOURCE_RR,
+					FIB_ENTRY_FLAG_NONE);
+	  e->next_hop_child_index =
+	    fib_entry_child_add(e->next_hop_fib_entry_index,
+				ila_fib_node_type,
+				e - ilm->entries);
+
+	  /*
+	   * Create a route that results in the ILA entry
+	   */
+	  dpo_id_t dpo = DPO_INVALID;
+	  fib_prefix_t pfx = {
+	    .fp_addr = {
+	      .ip6 = e->ila_address,
+	    },
+	    .fp_len = 128,
+	    .fp_proto = FIB_PROTOCOL_IP6,
+	  };
+
+	  dpo_set(&dpo, ila_dpo_type, DPO_PROTO_IP6, e - ilm->entries);
+
+	  fib_table_entry_special_dpo_add(0,
+					  &pfx,
+					  FIB_SOURCE_PLUGIN_HI,
+					  FIB_ENTRY_FLAG_EXCLUSIVE,
+					  &dpo);
+	  dpo_reset(&dpo);
+
+	  /*
+	   * finally stack the ILA entry so it will forward to the next-hop
+	   */
+	  ila_entry_stack (e);
+	}
+    }
+  else
+    {
+      ila_entry_t *e;
+      kv.key[0] = args->sir_address.as_u64[0];
+      kv.key[1] = args->sir_address.as_u64[1];
+      kv.key[2] = 0;
+
+      if ((BV (clib_bihash_search) (&ilm->id_to_entry_table, &kv, &value) <
+	   0))
+	{
+	  return -1;
+	}
+
+      e = &ilm->entries[value.value];
+
+      if (!ip6_address_is_zero(&e->next_hop))
+	{
+	  fib_prefix_t pfx = {
+	    .fp_addr = {
+	      .ip6 = e->ila_address,
+	    },
+	    .fp_len = 128,
+	    .fp_proto = FIB_PROTOCOL_IP6,
+	  };
+
+	  fib_table_entry_special_remove(0, &pfx, FIB_SOURCE_PLUGIN_HI);
+	  /*
+	   * remove this ILA entry as child of the FIB entry for the next-hop
+	   */
+	  fib_entry_child_remove(e->next_hop_fib_entry_index,
+				 e->next_hop_child_index);
+	  fib_table_entry_delete_index(e->next_hop_fib_entry_index,
+				       FIB_SOURCE_RR);
+	  e->next_hop_fib_entry_index = FIB_NODE_INDEX_INVALID;
+	}
+      dpo_reset (&e->ila_dpo);
+
+      BV (clib_bihash_add_del) (&ilm->id_to_entry_table, &kv,
+				0 /* is_add */ );
+      pool_put (ilm->entries, e);
+    }
+  return 0;
+}
+
+int
+ila_interface (u32 sw_if_index, u8 disable)
+{
+  /* The sir-to-ila node is registered on the ip6-unicast arc above */
+  vnet_feature_enable_disable ("ip6-unicast", "sir-to-ila", sw_if_index,
+			       !disable, 0, 0);
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_PLUGIN_REGISTER () = {
+    .version = VPP_BUILD_VER,
+    .description = "Identifier-locator addressing for IPv6",
+};
+/* *INDENT-ON* */
+
+u8 *format_ila_dpo (u8 * s, va_list * va)
+{
+  index_t index = va_arg (*va, index_t);
+  CLIB_UNUSED(u32 indent) = va_arg (*va, u32);
+  ila_main_t *ilm = &ila_main;
+  ila_entry_t *ie = pool_elt_at_index (ilm->entries, index);
+  return format(s, "ILA: idx:%d sir:%U",
+		index,
+		format_ip6_address, &ie->sir_address);
+}
+
+/**
+ * @brief no-op lock function.
+ * The lifetime of the ILA entry is managed by the control plane
+ */
+static void
+ila_dpo_lock (dpo_id_t *dpo)
+{
+}
+
+/**
+ * @brief no-op unlock function.
+ * The lifetime of the ILA entry is managed by the control plane
+ */
+static void
+ila_dpo_unlock (dpo_id_t *dpo)
+{
+}
+
+const static dpo_vft_t ila_vft = {
+  .dv_lock = ila_dpo_lock,
+  .dv_unlock = ila_dpo_unlock,
+  .dv_format = format_ila_dpo,
+};
+const static char* const ila_ip6_nodes[] =
+{
+  "ila-to-sir",
+  NULL,
+};
+const static char* const * const ila_nodes[DPO_PROTO_NUM] =
+{
+  [DPO_PROTO_IP6] = ila_ip6_nodes,
+};
+
+static fib_node_t *
+ila_fib_node_get_node (fib_node_index_t index)
+{
+  ila_main_t *ilm = &ila_main;
+  ila_entry_t *ie = pool_elt_at_index (ilm->entries, index);
+
+  return (&ie->ila_fib_node);
+}
+
+/**
+ * @brief no-op unlock function.
+ * The lifetime of the ILA entry is managed by the control plane
+ */
+static void
+ila_fib_node_last_lock_gone (fib_node_t *node)
+{
+}
+
+static ila_entry_t *
+ila_entry_from_fib_node (fib_node_t *node)
+{
+  return ((ila_entry_t*)(((char*)node) -
+			 STRUCT_OFFSET_OF(ila_entry_t, ila_fib_node)));
+}
+
+/**
+ * @brief
+ * Callback function invoked when the forwarding changes for the ILA next-hop
+ */
+static fib_node_back_walk_rc_t
+ila_fib_node_back_walk_notify (fib_node_t *node,
+			       fib_node_back_walk_ctx_t *ctx)
+{
+  ila_entry_stack(ila_entry_from_fib_node(node));
+
+  return (FIB_NODE_BACK_WALK_CONTINUE);
+}
+
+/*
+ * ILA's FIB graph node virtual function table
+ */
+static const fib_node_vft_t ila_fib_node_vft = {
+  .fnv_get = ila_fib_node_get_node,
+  .fnv_last_lock = ila_fib_node_last_lock_gone,
+  .fnv_back_walk = ila_fib_node_back_walk_notify,
+};
+
+clib_error_t *
+ila_init (vlib_main_t * vm)
+{
+  ila_main_t *ilm = &ila_main;
+  ilm->entries = NULL;
+
+  ilm->lookup_table_nbuckets = ILA_TABLE_DEFAULT_HASH_NUM_BUCKETS;
+  ilm->lookup_table_nbuckets = 1 << max_log2 (ilm->lookup_table_nbuckets);
+  ilm->lookup_table_size = ILA_TABLE_DEFAULT_HASH_MEMORY_SIZE;
+
+  BV (clib_bihash_init) (&ilm->id_to_entry_table,
+			 "ila id to entry index table",
+			 ilm->lookup_table_nbuckets, ilm->lookup_table_size);
+
+  ila_dpo_type = dpo_register_new_type(&ila_vft, ila_nodes);
+  ila_fib_node_type = fib_node_register_new_type(&ila_fib_node_vft);
+
+  return NULL;
+}
+
+VLIB_INIT_FUNCTION (ila_init);
+
+static clib_error_t *
+ila_entry_command_fn (vlib_main_t * vm,
+		      unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  ila_add_del_entry_args_t args = { 0 };
+  u8 next_hop_set = 0;
+  int ret;
+  clib_error_t *error = 0;
+
+  args.type = ILA_TYPE_IID;
+  args.csum_mode = ILA_CSUM_MODE_NO_ACTION;
+  args.local_adj_index = ~0;
+  args.dir = ILA_DIR_BIDIR;
+
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (line_input, "type %U", unformat_ila_type, &args.type))
+	;
+      else if (unformat
+	       (line_input, "sir-address %U", unformat_ip6_address,
+		&args.sir_address))
+	;
+      else if (unformat
+	       (line_input, "locator %U", unformat_half_ip6_address,
+		&args.locator))
+	;
+      else if (unformat
+	       (line_input, "csum-mode %U", unformat_ila_csum_mode,
+		&args.csum_mode))
+	;
+      else if (unformat (line_input, "vnid %x", &args.vnid))
+	;
+      else if (unformat
+	       (line_input, "next-hop %U", unformat_ip6_address,
+		&args.next_hop_address))
+	next_hop_set = 1;
+      else if (unformat
+	       (line_input, "direction %U", unformat_ila_direction, &args.dir))
+	;
+      else if (unformat (line_input, "del"))
+	args.is_del = 1;
+      else
+	{
+	  error = clib_error_return (0, "parse error: '%U'",
+				     format_unformat_error, line_input);
+	  goto done;
+	}
+    }
+
+  if (!next_hop_set)
+    {
+      error = clib_error_return (0, "next-hop address not specified");
+      goto done;
+    }
+
+  if ((ret = ila_add_del_entry (&args)))
+    {
+      error = clib_error_return (0, "ila_add_del_entry returned error %d", ret);
+      goto done;
+    }
+
+done:
+  unformat_free (line_input);
+
+  return error;
+}
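+/* Example usage (hypothetical addresses; note that for type luid the top
+ * three bits of byte 8 of the SIR address must encode the type, as
+ * enforced by the sanity check in ila_add_del_entry above):
+ *
+ *   ila entry type luid sir-address 2001:db8::2000:0:0:1
+ *     locator 2001:db8:0:1 next-hop 2001:db8::1
+ *     csum-mode neutral-map direction bidir
+ */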
+VLIB_CLI_COMMAND (ila_entry_command, static) =
+{
+  .path = "ila entry",
+  .short_help = "ila entry [type <type>] [sir-address <address>] [locator <locator>] [vnid <hex-vnid>]"
+    " [adj-index <adj-index>] [next-hop <next-hop>] [direction (bidir|sir2ila|ila2sir)]"
+    " [csum-mode (no-action|neutral-map|adjust-transport)] [del]",
+  .function = ila_entry_command_fn,
+};
+
+static clib_error_t *
+ila_interface_command_fn (vlib_main_t * vm,
+			  unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  u32 sw_if_index = ~0;
+  u8 disable = 0;
+
+  if (!unformat (input, "%U", unformat_vnet_sw_interface, vnm, &sw_if_index))
+    {
+      return clib_error_return (0, "Invalid interface name");
+    }
+
+  if (unformat (input, "disable"))
+    {
+      disable = 1;
+    }
+
+  int ret;
+  if ((ret = ila_interface (sw_if_index, disable)))
+    return clib_error_return (0, "ila_interface returned error %d", ret);
+
+  return NULL;
+}
+
+VLIB_CLI_COMMAND (ila_interface_command, static) =
+{
+  .path = "ila interface",
+  .short_help = "ila interface <interface-name> [disable]",
+  .function = ila_interface_command_fn,
+};
+
+static clib_error_t *
+ila_show_entries_command_fn (vlib_main_t * vm,
+			     unformat_input_t * input,
+			     vlib_cli_command_t * cmd)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  ila_main_t *ilm = &ila_main;
+  ila_entry_t *e;
+
+  vlib_cli_output (vm, "  %U\n", format_ila_entry, vnm, NULL);
+  pool_foreach (e, ilm->entries,
+    ({
+      vlib_cli_output (vm, "  %U\n", format_ila_entry, vnm, e);
+    }));
+
+  return NULL;
+}
+
+VLIB_CLI_COMMAND (ila_show_entries_command, static) =
+{
+  .path = "show ila entries",
+  .short_help = "show ila entries",
+  .function = ila_show_entries_command_fn,
+};
diff --git a/src/plugins/ila/ila.h b/src/plugins/ila/ila.h
new file mode 100644
index 00000000..26620983
--- /dev/null
+++ b/src/plugins/ila/ila.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ILA_H
+#define ILA_H
+
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/fib/fib_node.h>
+
+#include <vppinfra/bihash_24_8.h>
+#include <vppinfra/bihash_template.h>
+
+#define ila_foreach_type \
+  _(IID, 0, "iid") \
+  _(LUID, 1, "luid") \
+  _(VNID4, 2, "vnid-ip4") \
+  _(VNID6, 3, "vnid-ip6") \
+  _(VNIDM, 4, "vnid-multicast")
+
+typedef enum {
+#define _(i,n,s) ILA_TYPE_##i = n,
+  ila_foreach_type
+#undef _
+} ila_type_t;
+
+#define ila_csum_foreach_type \
+_(NO_ACTION, 0, "no-action") \
+_(NEUTRAL_MAP, 1, "neutral-map") \
+_(ADJUST_TRANSPORT, 2, "adjust-transport")
+
+typedef enum {
+#define _(i,n,s) ILA_CSUM_MODE_##i = n,
+  ila_csum_foreach_type
+#undef _
+  ILA_CSUM_N_TYPES
+} ila_csum_mode_t;
+
+#define ila_foreach_direction \
+_(BIDIR, 0, "bidir") \
+_(SIR2ILA, 1, "sir2ila") \
+_(ILA2SIR, 2, "ila2sir")
+
+typedef enum {
+#define _(i,n,s) ILA_DIR_##i = n,
+  ila_foreach_direction
+#undef _
+} ila_direction_t;
+
+typedef struct {
+  /**
+   * Fib Node base class
+   */
+  fib_node_t ila_fib_node;
+  ila_type_t type;
+  ip6_address_t sir_address;
+  ip6_address_t ila_address;
+  ip6_address_t next_hop;
+  ila_csum_mode_t csum_mode;
+  ila_direction_t dir;
+
+  /**
+   * The FIB entry index for the next-hop
+   */
+  fib_node_index_t next_hop_fib_entry_index;
+
+  /**
+   * The child index on the FIB entry
+   */
+  u32 next_hop_child_index;
+
+  /**
+   * The next DPO in the graph to follow
+   */
+  dpo_id_t ila_dpo;
+} ila_entry_t;
+
+typedef struct {
+  ila_entry_t *entries; //Pool of ILA entries
+
+  u64 lookup_table_nbuckets;
+  u64 lookup_table_size;
+  clib_bihash_24_8_t id_to_entry_table;
+
+  u32 ip6_lookup_next_index;
+} ila_main_t;
+
+
+typedef struct {
+  ila_type_t type;
+  ip6_address_t sir_address;
+  ip6_address_t next_hop_address;
+  u64 locator;
+  u32 vnid;
+  u32 local_adj_index;
+  ila_csum_mode_t csum_mode;
+  ila_direction_t dir;
+  u8 is_del;
+} ila_add_del_entry_args_t;
+
+int ila_add_del_entry (ila_add_del_entry_args_t * args);
+int ila_interface (u32 sw_if_index, u8 disable);
+
+#endif //ILA_H
diff --git a/src/plugins/ioam.am b/src/plugins/ioam.am
new file mode 100644
index 00000000..4ac69aac
--- /dev/null
+++ b/src/plugins/ioam.am
@@ -0,0 +1,247 @@
+# Copyright (c) 2015 Cisco and/or its affiliates.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
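+
+# Layout note: each feature in this fragment follows the same pattern --
+# a <FEATURE>_SRC list compiled into ioam_plugin.la, a <FEATURE>_NOINST_HDR
+# list of private headers, and a <FEATURE>_API list of .api files; the
+# aggregate assignments at the bottom collect them into
+# ioam_plugin_la_SOURCES, noinst_HEADERS and API_FILES.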
+ + +######################################## +# iOAM Proof of Transit +######################################## + +IOAM_POT_SRC = \ + ioam/lib-pot/pot_util.c \ + ioam/encap/ip6_ioam_pot.c \ + ioam/lib-pot/pot_util.h \ + ioam/lib-pot/math64.h \ + ioam/lib-pot/pot_api.c + +IOAM_POT_NOINST_HDR = \ + ioam/lib-pot/pot_all_api_h.h \ + ioam/lib-pot/pot_msg_enum.h \ + ioam/lib-pot/pot.api.h \ + ioam/lib-pot/pot_util.h \ + ioam/lib-pot/math64.h + +IOAM_POT_API = ioam/lib-pot/pot.api + +ioam_pot_test_plugin_la_SOURCES = \ + ioam/lib-pot/pot_test.c \ + ioam/lib-pot/pot_plugin.api.h + +vppapitestplugins_LTLIBRARIES += ioam_pot_test_plugin.la + +######################################## +# iOAM trace export for IPv6 +######################################## + +IOAM_EXPORT_SRC = \ +ioam/export/ioam_export.c \ +ioam/export/node.c \ +ioam/export/ioam_export.api.h \ +ioam/export/ioam_export_thread.c + +IOAM_EXPORT_NOINST_HDR = \ + ioam/export/ioam_export_all_api_h.h \ + ioam/export/ioam_export_msg_enum.h \ + ioam/export/ioam_export.api.h + +IOAM_EXPORT_API = ioam/export/ioam_export.api + +ioam_export_test_plugin_la_SOURCES = \ + ioam/export/ioam_export_test.c \ + ioam/export/ioam_export_plugin.api.h + +vppapitestplugins_LTLIBRARIES += ioam_export_test_plugin.la + +######################################## +# iOAM Trace +######################################## +IOAM_TRACE_SRC = \ + ioam/lib-trace/trace_util.c \ + ioam/encap/ip6_ioam_trace.c \ + ioam/lib-trace/trace_api.c + +IOAM_TRACE_NOINST_HDR = \ + ioam/export/ioam_export_all_api_h.h \ + ioam/lib-trace/trace_all_api_h.h \ + ioam/lib-trace/trace_msg_enum.h \ + ioam/lib-trace/trace.api.h \ + ioam/lib-trace/trace_util.h \ + ioam/encap/ip6_ioam_trace.h \ + ioam/lib-trace/trace_config.h + +IOAM_TRACE_API = ioam/lib-trace/trace.api + +ioam_trace_test_plugin_la_SOURCES = \ + ioam/lib-trace/trace_test.c \ + ioam/lib-trace/trace_plugin.api.h + +vppapitestplugins_LTLIBRARIES += ioam_trace_test_plugin.la + +nobase_include_HEADERS += \ + ioam/lib-trace/trace_util.h \ + ioam/export-common/ioam_export.h + +######################################## +# VxLAN-GPE +######################################## +IOAM_VXLAN_GPE_SRC = \ + ioam/lib-vxlan-gpe/ioam_encap.c \ + ioam/lib-vxlan-gpe/ioam_decap.c \ + ioam/lib-vxlan-gpe/ioam_transit.c \ + ioam/lib-vxlan-gpe/ioam_pop.c \ + ioam/lib-vxlan-gpe/vxlan_gpe_api.c \ + ioam/lib-vxlan-gpe/vxlan_gpe_ioam_trace.c \ + ioam/lib-vxlan-gpe/vxlan_gpe_ioam.c \ + ioam/export-vxlan-gpe/vxlan_gpe_ioam_export.c \ + ioam/export-vxlan-gpe/vxlan_gpe_node.c \ + ioam/export-vxlan-gpe/vxlan_gpe_ioam_export.api.h\ + ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_thread.c + +IOAM_VXLAN_GPE_NOINST_HDR = \ + ioam/export/ioam_export_all_api_h.h \ + ioam/lib-vxlan-gpe/vxlan_gpe_all_api_h.h \ + ioam/lib-vxlan-gpe/vxlan_gpe_msg_enum.h \ + ioam/lib-vxlan-gpe/ioam_vxlan_gpe.api.h \ + ioam/lib-vxlan-gpe/vxlan_gpe_ioam_util.h \ + ioam/lib-vxlan-gpe/vxlan_gpe_ioam_packet.h \ + ioam/lib-vxlan-gpe/vxlan_gpe_ioam.h \ + ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_all_api_h.h \ + ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_msg_enum.h \ + ioam/export-vxlan-gpe/vxlan_gpe_ioam_export.api.h + +IOAM_VXLAN_GPE_API = ioam/lib-vxlan-gpe/ioam_vxlan_gpe.api +IOAM_VXLAN_GPE_API += ioam/export-vxlan-gpe/vxlan_gpe_ioam_export.api + +ioam_vxlan_gpe_test_plugin_la_SOURCES = \ + ioam/lib-vxlan-gpe/vxlan_gpe_test.c \ + ioam/lib-vxlan-gpe/vxlan_gpe_plugin.api.h + +vppapitestplugins_LTLIBRARIES += ioam_vxlan_gpe_test_plugin.la + +vxlan_gpe_ioam_export_test_plugin_la_SOURCES = \ + 
ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_test.c \ + ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_plugin.api.h + +vppapitestplugins_LTLIBRARIES += vxlan_gpe_ioam_export_test_plugin.la + +######################################## +# iOAM E2E +######################################## + +IOAM_E2E_SRC = \ + ioam/encap/ip6_ioam_e2e.c \ + ioam/encap/ip6_ioam_seqno.c \ + ioam/lib-e2e/ioam_seqno_lib.c + +IOAM_E2E_NOINST_HDR = \ + ioam/encap/ip6_ioam_e2e.h \ + ioam/encap/ip6_ioam_seqno.h \ + ioam/lib-e2e/ioam_seqno_lib.h + +######################################## +# ipfix collector +######################################## + +IPFIX_COLLECTOR_SRC = \ + ioam/ipfixcollector/ipfixcollector.c \ + ioam/ipfixcollector/node.c \ + ioam/ipfixcollector/ipfixcollector.h + +######################################## +# iOAM Analyse +######################################## + +IOAM_ANALYSE_SRC = \ + ioam/analyse/ip6/ip6_ioam_analyse.c \ + ioam/analyse/ip6/node.c \ + ioam/analyse/ip6/ip6_ioam_analyse.h \ + ioam/analyse/ioam_summary_export.c \ + ioam/analyse/ioam_analyse.h \ + ioam/analyse/ioam_summary_export.h + +######################################## +# iOAM record cache and rewrite +######################################## + +IOAM_IP6_MANYCAST_SRC = \ +ioam/ip6/ioam_cache.c \ +ioam/ip6/ioam_cache_node.c \ +ioam/ip6/ioam_cache_tunnel_select_node.c \ +ioam/ip6/ioam_cache.api.h + +IOAM_IP6_MANYCAST_API = ioam/ip6/ioam_cache.api + +IOAM_IP6_MANYCAST_NOINST_HDR = \ + ioam/ip6/ioam_cache_all_api_h.h \ + ioam/ip6/ioam_cache_msg_enum.h \ + ioam/ip6/ioam_cache.api.h + +# udp ping +######################################## + +UDP_PING_SRC = \ + ioam/udp-ping/udp_ping_node.c \ + ioam/udp-ping/udp_ping_util.c \ + ioam/udp-ping/udp_ping_export.c \ + ioam/udp-ping/udp_ping_api.c + +UDP_PING_NOINST_HDR = \ + ioam/udp-ping/udp_ping_packet.h \ + ioam/udp-ping/udp_ping.h \ + ioam/udp-ping/udp_ping_util.h \ + ioam/udp-ping/udp_ping_all_api_h.h \ + ioam/udp-ping/udp_ping_msg_enum.h \ + ioam/udp-ping/udp_ping.api.h + +UDP_PING_API = ioam/udp-ping/udp_ping.api + +udp_ping_test_plugin_la_SOURCES = \ + ioam/udp-ping/udp_ping_test.c \ + ioam/udp-ping/udp_ping_plugin.api.h + +vppapitestplugins_LTLIBRARIES += udp_ping_test_plugin.la + +######################################## +# iOAM plugins +######################################## + +ioam_plugin_la_SOURCES = \ + $(IOAM_POT_SRC) \ + $(IOAM_EXPORT_SRC) \ + $(IOAM_TRACE_SRC) \ + $(IOAM_VXLAN_GPE_SRC) \ + $(IOAM_E2E_SRC) \ + $(IPFIX_COLLECTOR_SRC) \ + $(IOAM_ANALYSE_SRC) \ + $(IOAM_IP6_MANYCAST_SRC) \ + $(UDP_PING_SRC) + +API_FILES += \ + $(IOAM_POT_API) \ + $(IOAM_EXPORT_API) \ + $(IOAM_TRACE_API) \ + $(IOAM_VXLAN_GPE_API) \ + $(IOAM_IP6_MANYCAST_API) \ + $(UDP_PING_API) + +noinst_HEADERS += \ + $(IOAM_POT_NOINST_HDR) \ + $(IOAM_EXPORT_NOINST_HDR) \ + $(IOAM_TRACE_NOINST_HDR) \ + $(IOAM_VXLAN_GPE_NOINST_HDR) \ + $(IOAM_E2E_NOINST_HDR) \ + $(IOAM_IP6_MANYCAST_NOINST_HDR) \ + $(UDP_PING_NOINST_HDR) + +vppplugins_LTLIBRARIES += ioam_plugin.la + +# vi:syntax=automake diff --git a/src/plugins/ioam/analyse/ioam_analyse.h b/src/plugins/ioam/analyse/ioam_analyse.h new file mode 100644 index 00000000..ef2865da --- /dev/null +++ b/src/plugins/ioam/analyse/ioam_analyse.h @@ -0,0 +1,526 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PLUGINS_IOAM_PLUGIN_IOAM_ANALYSE_IOAM_ANALYSE_H_
+#define PLUGINS_IOAM_PLUGIN_IOAM_ANALYSE_IOAM_ANALYSE_H_
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vppinfra/types.h>
+#include <ioam/lib-e2e/e2e_util.h>
+#include <ioam/lib-trace/trace_util.h>
+#include <ioam/lib-trace/trace_config.h>
+
+#define IOAM_FLOW_TEMPLATE_ID 260
+#define IOAM_TRACE_MAX_NODES 10
+#define IOAM_MAX_PATHS_PER_FLOW 10
+
+typedef struct
+{
+ u16 ingress_if;
+ u16 egress_if;
+ u32 node_id;
+ u32 state_up;
+} ioam_path_map_t;
+
+/** @brief Analysed iOAM trace data.
+ @note cache aligned.
+*/
+typedef struct
+{
+ /** Number of nodes in the path. */
+ u8 num_nodes;
+
+ /** Data contained in trace - NodeId, TTL, Ingress & Egress Link, Timestamp. */
+ u8 trace_type;
+
+ /** Flag to indicate whether node is allocated. */
+ u8 is_free;
+
+ u8 pad[5];
+
+ /** Actual path the flow has taken. */
+ ioam_path_map_t path[IOAM_TRACE_MAX_NODES];
+
+ /** Number of packets in the flow going over the path. */
+ u32 pkt_counter;
+
+ /** Number of bytes in the flow going over the path. */
+ u32 bytes_counter;
+
+ /** Minimum delay for the flow. */
+ u32 min_delay;
+
+ /** Maximum delay for the flow. */
+ u32 max_delay;
+
+ /** Average delay for the flow. */
+ u32 mean_delay;
+
+ u32 reserve;
+} ioam_analyse_trace_record;
+
+typedef struct
+{
+ ioam_analyse_trace_record path_data[IOAM_MAX_PATHS_PER_FLOW];
+} ioam_analyse_trace_data;
+
+/** @brief Analysed iOAM pot data.
+ @note cache aligned.
+*/
+typedef struct
+{
+ /** Number of packets validated (passed through the service chain)
+ within the timestamps. */
+ u32 sfc_validated_count;
+
+ /** Number of packets invalidated (failed validation through the service chain)
+ within the timestamps. */
+ u32 sfc_invalidated_count;
+} ioam_analyse_pot_data;
+
+/** @brief Analysed iOAM data.
+ @note cache aligned.
+*/
+typedef struct ioam_analyser_data_t_
+{
+ u8 is_free;
+ u8 pad[3];
+
+ /** Number of packets sent for this flow. */
+ u32 pkt_sent;
+
+ /** Number of packets matching this flow. */
+ u32 pkt_counter;
+
+ /** Number of bytes matching this flow. */
+ u32 bytes_counter;
+
+ /** Analysed iOAM trace data. */
+ ioam_analyse_trace_data trace_data;
+
+ /** Analysed iOAM pot data. */
+ ioam_analyse_pot_data pot_data;
+
+ /** Analysed iOAM seqno data. */
+ seqno_rx_info seqno_data;
+
+ /** Cache of previously analysed data, useful for export. */
+ struct ioam_analyser_data_t_ *chached_data_list;
+
+ /** Lock protecting this record, since the data is also exported from another thread.
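+ Writers acquire it with __sync_lock_test_and_set and release it by storing 0.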
 */
+ volatile u32 *writer_lock;
+} ioam_analyser_data_t;
+
+always_inline f64
+ip6_ioam_analyse_calc_delay (ioam_trace_hdr_t * trace, u16 trace_len,
+			     u8 oneway)
+{
+ u16 size_of_all_traceopts;
+ u8 size_of_traceopt_per_node;
+ u8 num_nodes;
+ u32 *start_elt, *end_elt, *uturn_elt;
+ u32 start_time, end_time;
+ u8 done = 0;
+
+ size_of_traceopt_per_node = fetch_trace_data_size (trace->ioam_trace_type);
+ // Unknown trace type
+ if (size_of_traceopt_per_node == 0)
+ return 0;
+ size_of_all_traceopts = trace_len; /*ioam_trace_type,data_list_elts_left */
+
+ num_nodes = (u8) (size_of_all_traceopts / size_of_traceopt_per_node);
+ if ((num_nodes == 0) || (num_nodes <= trace->data_list_elts_left))
+ return 0;
+
+ num_nodes -= trace->data_list_elts_left;
+
+ start_elt = trace->elts;
+ end_elt =
+ trace->elts +
+ (u32) ((size_of_traceopt_per_node / sizeof (u32)) * (num_nodes - 1));
+
+ if (oneway && (trace->ioam_trace_type & BIT_TTL_NODEID))
+ {
+ done = 0;
+ do
+ {
+ uturn_elt = start_elt - size_of_traceopt_per_node / sizeof (u32);
+
+ if ((clib_net_to_host_u32 (*start_elt) >> 24) <=
+ (clib_net_to_host_u32 (*uturn_elt) >> 24))
+ done = 1;
+ }
+ while (!done && (start_elt = uturn_elt) != end_elt);
+ }
+ if (trace->ioam_trace_type & BIT_TTL_NODEID)
+ {
+ start_elt++;
+ end_elt++;
+ }
+ if (trace->ioam_trace_type & BIT_ING_INTERFACE)
+ {
+ start_elt++;
+ end_elt++;
+ }
+ start_time = clib_net_to_host_u32 (*start_elt);
+ end_time = clib_net_to_host_u32 (*end_elt);
+
+ return (f64) (end_time - start_time);
+}
+
+always_inline void
+ip6_ioam_analyse_set_paths_down (ioam_analyser_data_t * data)
+{
+ ioam_analyse_trace_data *trace_data;
+ ioam_analyse_trace_record *trace_record;
+ ioam_path_map_t *path;
+ u8 k, i;
+
+ while (__sync_lock_test_and_set (data->writer_lock, 1))
+ ;
+
+ trace_data = &data->trace_data;
+
+ for (i = 0; i < IOAM_MAX_PATHS_PER_FLOW; i++)
+ {
+ trace_record = trace_data->path_data + i;
+
+ if (trace_record->is_free)
+ continue;
+
+ path = trace_record->path;
+
+ for (k = 0; k < trace_record->num_nodes; k++)
+ path[k].state_up = 0;
+ }
+ *(data->writer_lock) = 0;
+}
+
+always_inline void
+ip6_ioam_analyse_hbh_trace_loopback (ioam_analyser_data_t * data,
+				     ioam_trace_hdr_t * trace, u16 trace_len)
+{
+ ioam_analyse_trace_data *trace_data;
+ ioam_analyse_trace_record *trace_record;
+ ioam_path_map_t *path;
+ u8 i, j, k, num_nodes, max_nodes;
+ u8 *ptr;
+ u32 nodeid;
+ u16 ingress_if, egress_if;
+ u16 size_of_traceopt_per_node;
+ u16 size_of_all_traceopts;
+
+ while (__sync_lock_test_and_set (data->writer_lock, 1))
+ ;
+
+ trace_data = &data->trace_data;
+
+ size_of_traceopt_per_node = fetch_trace_data_size (trace->ioam_trace_type);
+ if (0 == size_of_traceopt_per_node)
+ goto end;
+
+ size_of_all_traceopts = trace_len;
+
+ ptr = (u8 *) trace->elts;
+ max_nodes = (u8) (size_of_all_traceopts / size_of_traceopt_per_node);
+ num_nodes = max_nodes - trace->data_list_elts_left;
+
+ for (i = 0; i < IOAM_MAX_PATHS_PER_FLOW; i++)
+ {
+ trace_record = trace_data->path_data + i;
+ path = trace_record->path;
+
+ if (trace_record->is_free)
+ continue;
+
+ for (j = max_nodes, k = 0; k < num_nodes; j--, k++)
+ {
+ ptr =
+ (u8 *) ((u8 *) trace->elts +
+ (size_of_traceopt_per_node * (j - 1)));
+
+ nodeid = clib_net_to_host_u32 (*((u32 *) ptr)) & 0x00ffffff;
+ ptr += 4;
+
+ if (nodeid != path[k].node_id)
+ goto end;
+
+ if ((trace->ioam_trace_type == TRACE_TYPE_IF_TS_APP) ||
+ (trace->ioam_trace_type == TRACE_TYPE_IF))
+ {
+ ingress_if = clib_net_to_host_u16 (*((u16 *) ptr));
+ ptr += 2;
+ egress_if =
clib_net_to_host_u16 (*((u16 *) ptr));
+ if ((ingress_if != path[k].ingress_if) ||
+ (egress_if != path[k].egress_if))
+ {
+ goto end;
+ }
+ }
+ /* Found Match - set path hop state to up */
+ path[k].state_up = 1;
+ }
+ }
+end:
+ *(data->writer_lock) = 0;
+}
+
+always_inline int
+ip6_ioam_analyse_hbh_trace (ioam_analyser_data_t * data,
+			    ioam_trace_hdr_t * trace, u16 pak_len,
+			    u16 trace_len)
+{
+ ioam_analyse_trace_data *trace_data;
+ u16 size_of_traceopt_per_node;
+ u16 size_of_all_traceopts;
+ u8 i, j, k, num_nodes, max_nodes;
+ u8 *ptr;
+ u32 nodeid;
+ u16 ingress_if, egress_if;
+ ioam_path_map_t *path = NULL;
+ ioam_analyse_trace_record *trace_record;
+
+ while (__sync_lock_test_and_set (data->writer_lock, 1))
+ ;
+
+ trace_data = &data->trace_data;
+
+ size_of_traceopt_per_node = fetch_trace_data_size (trace->ioam_trace_type);
+ // Unknown trace type
+ if (size_of_traceopt_per_node == 0)
+ goto DONE;
+ size_of_all_traceopts = trace_len;
+
+ ptr = (u8 *) trace->elts;
+ max_nodes = (u8) (size_of_all_traceopts / size_of_traceopt_per_node);
+ num_nodes = max_nodes - trace->data_list_elts_left;
+
+ for (i = 0; i < IOAM_MAX_PATHS_PER_FLOW; i++)
+ {
+ trace_record = trace_data->path_data + i;
+
+ if (trace_record->is_free ||
+ (num_nodes != trace_record->num_nodes) ||
+ (trace->ioam_trace_type != trace_record->trace_type))
+ continue;
+
+ path = trace_record->path;
+
+ for (j = max_nodes, k = 0; k < num_nodes; j--, k++)
+ {
+ ptr =
+ (u8 *) ((u8 *) trace->elts +
+ (size_of_traceopt_per_node * (j - 1)));
+
+ nodeid = clib_net_to_host_u32 (*((u32 *) ptr)) & 0x00ffffff;
+ ptr += 4;
+
+ if (nodeid != path[k].node_id)
+ break;
+
+ if ((trace->ioam_trace_type == TRACE_TYPE_IF_TS_APP) ||
+ (trace->ioam_trace_type == TRACE_TYPE_IF))
+ {
+ ingress_if = clib_net_to_host_u16 (*((u16 *) ptr));
+ ptr += 2;
+ egress_if = clib_net_to_host_u16 (*((u16 *) ptr));
+ if ((ingress_if != path[k].ingress_if) ||
+ (egress_if != path[k].egress_if))
+ {
+ break;
+ }
+ }
+ }
+
+ if (k == num_nodes)
+ {
+ goto found_match;
+ }
+ }
+
+ for (i = 0; i < IOAM_MAX_PATHS_PER_FLOW; i++)
+ {
+ trace_record = trace_data->path_data + i;
+ if (trace_record->is_free)
+ {
+ trace_record->is_free = 0;
+ trace_record->num_nodes = num_nodes;
+ trace_record->trace_type = trace->ioam_trace_type;
+ path = trace_data->path_data[i].path;
+ trace_record->pkt_counter = 0;
+ trace_record->bytes_counter = 0;
+ trace_record->min_delay = 0xFFFFFFFF;
+ trace_record->max_delay = 0;
+ trace_record->mean_delay = 0;
+ break;
+ }
+ }
+
+ for (j = max_nodes, k = 0; k < num_nodes; j--, k++)
+ {
+ ptr =
+ (u8 *) ((u8 *) trace->elts + (size_of_traceopt_per_node * (j - 1)));
+
+ path[k].node_id = clib_net_to_host_u32 (*((u32 *) ptr)) & 0x00ffffff;
+ ptr += 4;
+
+ if ((trace->ioam_trace_type == TRACE_TYPE_IF_TS_APP) ||
+ (trace->ioam_trace_type == TRACE_TYPE_IF))
+ {
+ path[k].ingress_if = clib_net_to_host_u16 (*((u16 *) ptr));
+ ptr += 2;
+ path[k].egress_if = clib_net_to_host_u16 (*((u16 *) ptr));
+ }
+ }
+
+found_match:
+ /* Set path state to UP */
+ for (k = 0; k < num_nodes; k++)
+ path[k].state_up = 1;
+
+ trace_record->pkt_counter++;
+ trace_record->bytes_counter += pak_len;
+ if (trace->ioam_trace_type & BIT_TIMESTAMP)
+ {
+ /* Calculate time delay */
+ u32 delay = (u32) ip6_ioam_analyse_calc_delay (trace, trace_len, 0);
+ if (delay < trace_record->min_delay)
+ trace_record->min_delay = delay;
+ if (delay > trace_record->max_delay)
+ trace_record->max_delay = delay;
+
+ u64 sum = (trace_record->mean_delay * data->seqno_data.rx_packets);
+ 
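+ /* Running mean, updated incrementally: mean = (mean * n + delay) / (n + 1),
+ where n is the seqno rx packet count (seqno_data.rx_packets). */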
trace_record->mean_delay = + (u32) ((sum + delay) / (data->seqno_data.rx_packets + 1)); + } +DONE: + *(data->writer_lock) = 0; + return 0; +} + +always_inline int +ip6_ioam_analyse_hbh_e2e (ioam_analyser_data_t * data, + ioam_e2e_packet_t * e2e, u16 len) +{ + while (__sync_lock_test_and_set (data->writer_lock, 1)) + ; + + ioam_analyze_seqno (&data->seqno_data, + (u64) clib_net_to_host_u32 (e2e->e2e_data)); + + *(data->writer_lock) = 0; + return 0; +} + +always_inline u8 * +format_path_map (u8 * s, va_list * args) +{ + ioam_path_map_t *pm = va_arg (*args, ioam_path_map_t *); + u32 num_of_elts = va_arg (*args, u32); + u32 i; + + for (i = 0; i < num_of_elts; i++) + { + s = + format (s, + "node_id: 0x%x, ingress_if: 0x%x, egress_if:0x%x, state:%s\n", + pm->node_id, pm->ingress_if, pm->egress_if, + pm->state_up ? "UP" : "DOWN"); + pm++; + } + + return (s); +} + +always_inline u8 * +print_analyse_flow (u8 * s, ioam_analyser_data_t * record) +{ + int j; + ioam_analyse_trace_record *trace_record; + + s = format (s, "pkt_sent : %u\n", record->pkt_sent); + s = format (s, "pkt_counter : %u\n", record->pkt_counter); + s = format (s, "bytes_counter : %u\n", record->bytes_counter); + + s = format (s, "Trace data: \n"); + + for (j = 0; j < IOAM_MAX_PATHS_PER_FLOW; j++) + { + trace_record = record->trace_data.path_data + j; + if (trace_record->is_free) + continue; + + s = format (s, "path_map:\n%U", format_path_map, + trace_record->path, trace_record->num_nodes); + s = format (s, "pkt_counter: %u\n", trace_record->pkt_counter); + s = format (s, "bytes_counter: %u\n", trace_record->bytes_counter); + + s = format (s, "min_delay: %u\n", trace_record->min_delay); + s = format (s, "max_delay: %u\n", trace_record->max_delay); + s = format (s, "mean_delay: %u\n", trace_record->mean_delay); + } + + s = format (s, "\nPOT data: \n"); + s = format (s, "sfc_validated_count : %u\n", + record->pot_data.sfc_validated_count); + s = format (s, "sfc_invalidated_count : %u\n", + record->pot_data.sfc_invalidated_count); + + s = format (s, "\nSeqno Data:\n"); + s = format (s, + "RX Packets : %lu\n" + "Lost Packets : %lu\n" + "Duplicate Packets : %lu\n" + "Reordered Packets : %lu\n", + record->seqno_data.rx_packets, + record->seqno_data.lost_packets, + record->seqno_data.dup_packets, + record->seqno_data.reordered_packets); + + s = format (s, "\n"); + return s; +} + +always_inline void +ioam_analyse_init_data (ioam_analyser_data_t * data) +{ + u16 j; + ioam_analyse_trace_data *trace_data; + + data->is_free = 1; + + /* We maintain data corresponding to last IP-Fix export, this may + * get extended in future to maintain history of data */ + vec_validate_aligned (data->chached_data_list, 0, CLIB_CACHE_LINE_BYTES); + + data->writer_lock = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES, + CLIB_CACHE_LINE_BYTES); + *(data->writer_lock) = 0; + + trace_data = &(data->trace_data); + for (j = 0; j < IOAM_MAX_PATHS_PER_FLOW; j++) + trace_data->path_data[j].is_free = 1; +} + +#endif /* PLUGINS_IOAM_PLUGIN_IOAM_ANALYSE_IOAM_ANALYSE_H_ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/analyse/ioam_summary_export.c b/src/plugins/ioam/analyse/ioam_summary_export.c new file mode 100644 index 00000000..af2d39ab --- /dev/null +++ b/src/plugins/ioam/analyse/ioam_summary_export.c @@ -0,0 +1,444 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vlib/vlib.h> +#include <vnet/ip/ip6_packet.h> +#include <ioam/analyse/ioam_summary_export.h> +#include <ioam/analyse/ip6/ip6_ioam_analyse.h> + +u8 * +ioam_template_rewrite (flow_report_main_t * frm, flow_report_t * fr, + ip4_address_t * collector_address, + ip4_address_t * src_address, u16 collector_port) +{ + ip4_header_t *ip; + udp_header_t *udp; + ipfix_message_header_t *h; + ipfix_set_header_t *s; + ipfix_template_header_t *t; + ipfix_field_specifier_t *f; + ipfix_field_specifier_t *first_field; + u8 *rewrite = 0; + ip4_ipfix_template_packet_t *tp; + u32 field_count = 0; + u32 field_index = 0; + flow_report_stream_t *stream; + + stream = &frm->streams[fr->stream_index]; + + /* Determine field count */ +#define _(field,mask,item,length) \ + { \ + field_count++; \ + fr->fields_to_send = clib_bitmap_set (fr->fields_to_send, \ + field_index, 1); \ + } \ + field_index++; + + foreach_ioam_ipfix_field; +#undef _ + + /* Add Src address, dest address, src port, dest port + * path map, number of paths manually */ + field_count += 6; + + /* allocate rewrite space */ + vec_validate_aligned (rewrite, + sizeof (ip4_ipfix_template_packet_t) + + field_count * sizeof (ipfix_field_specifier_t) - 1, + CLIB_CACHE_LINE_BYTES); + + tp = (ip4_ipfix_template_packet_t *) rewrite; + ip = (ip4_header_t *) & tp->ip4; + udp = (udp_header_t *) (ip + 1); + h = (ipfix_message_header_t *) (udp + 1); + s = (ipfix_set_header_t *) (h + 1); + t = (ipfix_template_header_t *) (s + 1); + first_field = f = (ipfix_field_specifier_t *) (t + 1); + + ip->ip_version_and_header_length = 0x45; + ip->ttl = 254; + ip->protocol = IP_PROTOCOL_UDP; + ip->src_address.as_u32 = src_address->as_u32; + ip->dst_address.as_u32 = collector_address->as_u32; + udp->src_port = clib_host_to_net_u16 (collector_port); + udp->dst_port = clib_host_to_net_u16 (UDP_DST_PORT_ipfix); + udp->length = clib_host_to_net_u16 (vec_len (rewrite) - sizeof (*ip)); + + h->domain_id = clib_host_to_net_u32 (stream->domain_id); //fr->domain_id); + + /* Add Src address, dest address, src port, dest port + * path map, number of paths manually */ + f->e_id_length = ipfix_e_id_length (0 /* enterprise */ , + sourceIPv6Address, + sizeof (ip6_address_t)); + f++; + + f->e_id_length = ipfix_e_id_length (0 /* enterprise */ , + destinationIPv6Address, + sizeof (ip6_address_t)); + f++; + + f->e_id_length = ipfix_e_id_length (0 /* enterprise */ , + sourceTransportPort, 2); + f++; + + f->e_id_length = ipfix_e_id_length (0 /* enterprise */ , + destinationTransportPort, 2); + f++; + +#define _(field,mask,item,length) \ + { \ + f->e_id_length = ipfix_e_id_length (0 /* enterprise */, \ + item, length); \ + f++; \ + } + foreach_ioam_ipfix_field; +#undef _ + + f->e_id_length = ipfix_e_id_length (0 /* enterprise */ , + ioamNumberOfPaths, 2); + f++; + + /* Add ioamPathMap manually */ + f->e_id_length = ipfix_e_id_length (0 /* enterprise */ , + ioamPathMap, + (sizeof (ioam_path) + + (sizeof (ioam_path_map_t) * + 
IOAM_TRACE_MAX_NODES))); + f++; + + /* Back to the template packet... */ + ip = (ip4_header_t *) & tp->ip4; + udp = (udp_header_t *) (ip + 1); + + ASSERT (f - first_field); + /* Field count in this template */ + t->id_count = ipfix_id_count (IOAM_FLOW_TEMPLATE_ID, f - first_field); + + /* set length in octets */ + s->set_id_length = + ipfix_set_id_length (2 /* set_id */ , (u8 *) f - (u8 *) s); + + /* message length in octets */ + h->version_length = version_length ((u8 *) f - (u8 *) h); + + ip->length = clib_host_to_net_u16 ((u8 *) f - (u8 *) ip); + ip->checksum = ip4_header_checksum (ip); + + return rewrite; +} + +u16 +ioam_analyse_add_ipfix_record (flow_report_t * fr, + ioam_analyser_data_t * record, + vlib_buffer_t * b0, u16 offset, + ip6_address_t * src, ip6_address_t * dst, + u16 src_port, u16 dst_port) +{ + while (__sync_lock_test_and_set (record->writer_lock, 1)) + ; + + int field_index = 0; + u16 tmp; + int i, j; + u16 num_paths = 0; + u16 num_paths_offset; + + + /* Add IPv6 source address manually */ + memcpy (b0->data + offset, &src->as_u64[0], sizeof (u64)); + offset += sizeof (u64); + memcpy (b0->data + offset, &src->as_u64[1], sizeof (u64)); + offset += sizeof (u64); + + /* Add IPv6 destination address manually */ + memcpy (b0->data + offset, &dst->as_u64[0], sizeof (u64)); + offset += sizeof (u64); + memcpy (b0->data + offset, &dst->as_u64[1], sizeof (u64)); + offset += sizeof (u64); + + /* Add source port manually */ + tmp = clib_host_to_net_u16 (src_port); + memcpy (b0->data + offset, &tmp, sizeof (u16)); + offset += sizeof (u16); + + /* Add dest port manually */ + tmp = clib_host_to_net_u16 (dst_port); + memcpy (b0->data + offset, &tmp, sizeof (u16)); + offset += sizeof (u16); + +#define _(field,mask,item,length) \ + if (clib_bitmap_get (fr->fields_to_send, field_index)) \ + { \ + /* Expect only 4 bytes */ \ + u32 tmp; \ + tmp = clib_host_to_net_u32((u32)record->field - (u32)record->chached_data_list->field);\ + memcpy (b0->data + offset, &tmp, length); \ + offset += length; \ + } + field_index++; + foreach_ioam_ipfix_field; +#undef _ + + /* Store num_paths_offset here and update later */ + num_paths_offset = offset; + offset += sizeof (u16); + + /* Add ioamPathMap manually */ + for (i = 0; i < IOAM_MAX_PATHS_PER_FLOW; i++) + { + ioam_analyse_trace_record *trace = record->trace_data.path_data + i; + ioam_analyse_trace_record *trace_cached = + record->chached_data_list->trace_data.path_data + i; + ioam_path *path = (ioam_path *) (b0->data + offset); + + if (!trace->is_free) + { + num_paths++; + + path->num_nodes = trace->num_nodes; + + path->trace_type = trace->trace_type; + if (0 < (trace->pkt_counter - trace_cached->pkt_counter)) + { + u64 new_sum = trace->mean_delay * record->seqno_data.rx_packets; + u64 old_sum = + trace_cached->mean_delay * + record->chached_data_list->seqno_data.rx_packets; + path->mean_delay = + (u32) ((new_sum - old_sum) / (trace->pkt_counter - + trace_cached->pkt_counter)); + path->mean_delay = clib_host_to_net_u32 (path->mean_delay); + } + else + path->mean_delay = 0; + + path->bytes_counter = + trace->bytes_counter - trace_cached->bytes_counter; + path->bytes_counter = clib_host_to_net_u32 (path->bytes_counter); + + path->pkt_counter = trace->pkt_counter - trace_cached->pkt_counter; + path->pkt_counter = clib_host_to_net_u32 (path->pkt_counter); + offset += sizeof (ioam_path); + + for (j = 0; j < trace->num_nodes; j++) + { + path->path[j].node_id = + clib_host_to_net_u32 (trace->path[j].node_id); + path->path[j].ingress_if = + 
clib_host_to_net_u16 (trace->path[j].ingress_if); + path->path[j].egress_if = + clib_host_to_net_u16 (trace->path[j].egress_if); + path->path[j].state_up = trace->path[j].state_up; + } + + //offset += (sizeof(ioam_path_map_t) * trace->num_nodes); + offset += (sizeof (ioam_path_map_t) * IOAM_TRACE_MAX_NODES); //FIXME + } + } + + num_paths = clib_host_to_net_u16 (num_paths); + memcpy (b0->data + num_paths_offset, &num_paths, sizeof (u16)); + + /* Update cache */ + *(record->chached_data_list) = *record; + record->chached_data_list->chached_data_list = NULL; + + *(record->writer_lock) = 0; + return offset; +} + +vlib_frame_t * +ioam_send_flows (flow_report_main_t * frm, flow_report_t * fr, + vlib_frame_t * f, u32 * to_next, u32 node_index) +{ + vlib_buffer_t *b0 = NULL; + u32 next_offset = 0; + u32 bi0 = ~0; + int i; + ip4_ipfix_template_packet_t *tp; + ipfix_message_header_t *h; + ipfix_set_header_t *s = NULL; + ip4_header_t *ip; + udp_header_t *udp; + u32 records_this_buffer; + u16 new_l0, old_l0; + ip_csum_t sum0; + vlib_main_t *vm = frm->vlib_main; + ip6_address_t temp; + ioam_analyser_data_t *record = NULL; + flow_report_stream_t *stream; + ioam_analyser_data_t *aggregated_data; + u16 data_len; + + stream = &frm->streams[fr->stream_index]; + + memset (&temp, 0, sizeof (ip6_address_t)); + + aggregated_data = ioam_analyser_main.aggregated_data; + data_len = vec_len (aggregated_data); + + vec_foreach_index (i, aggregated_data) + { + u8 flush = 0; + record = aggregated_data + i; + + /* Flush if last entry */ + if (i == (data_len - 1)) + flush = 1; + + if (!record->is_free) + { + + if (PREDICT_FALSE (b0 == NULL)) + { + if (vlib_buffer_alloc (vm, &bi0, 1) != 1) + break; + + b0 = vlib_get_buffer (vm, bi0); + memcpy (b0->data, fr->rewrite, vec_len (fr->rewrite)); + b0->current_data = 0; + b0->current_length = vec_len (fr->rewrite); + b0->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; + vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0; + + tp = vlib_buffer_get_current (b0); + ip = &tp->ip4; + h = &tp->ipfix.h; + s = &tp->ipfix.s; + + /* FIXUP: message header export_time */ + h->export_time = clib_host_to_net_u32 (((u32) time (NULL))); + + /* FIXUP: message header sequence_number */ + h->sequence_number = stream->sequence_number++; + h->sequence_number = clib_host_to_net_u32 (h->sequence_number); + next_offset = (u32) (((u8 *) (s + 1)) - (u8 *) tp); + records_this_buffer = 0; + } + + next_offset = ioam_analyse_add_ipfix_record (fr, record, + b0, next_offset, + &temp, &temp, 0, 0); + records_this_buffer++; + + /* Flush data if packet len is about to reach path mtu */ + if (next_offset > (frm->path_mtu - 250)) + flush = 1; + } + + if (PREDICT_FALSE (flush && b0)) + { + s->set_id_length = ipfix_set_id_length (IOAM_FLOW_TEMPLATE_ID, + next_offset - (sizeof (*ip) + + sizeof (*udp) + + sizeof (*h))); + b0->current_length = next_offset; + b0->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; + tp = vlib_buffer_get_current (b0); + ip = (ip4_header_t *) & tp->ip4; + udp = (udp_header_t *) (ip + 1); + + sum0 = ip->checksum; + old_l0 = ip->length; + new_l0 = clib_host_to_net_u16 ((u16) next_offset); + sum0 = ip_csum_update (sum0, old_l0, new_l0, ip4_header_t, + length /* changed member */ ); + + ip->checksum = ip_csum_fold (sum0); + ip->length = new_l0; + udp->length = + clib_host_to_net_u16 (b0->current_length - sizeof (*ip)); + + if (frm->udp_checksum) + { + /* RFC 7011 section 10.3.2. 
*/ + udp->checksum = ip4_tcp_udp_compute_checksum (vm, b0, ip); + if (udp->checksum == 0) + udp->checksum = 0xffff; + } + + to_next[0] = bi0; + f->n_vectors++; + to_next++; + + if (f->n_vectors == VLIB_FRAME_SIZE) + { + vlib_put_frame_to_node (vm, node_index, f); + f = vlib_get_frame_to_node (vm, node_index); + f->n_vectors = 0; + to_next = vlib_frame_vector_args (f); + } + b0 = 0; + bi0 = ~0; + } + } + + return f; +} + +clib_error_t * +ioam_flow_create (u8 del) +{ + vnet_flow_report_add_del_args_t args; + int rv; + u32 domain_id = 0; + flow_report_main_t *frm = &flow_report_main; + u16 template_id; + + memset (&args, 0, sizeof (args)); + args.rewrite_callback = ioam_template_rewrite; + args.flow_data_callback = ioam_send_flows; + del ? (args.is_add = 0) : (args.is_add = 1); + args.domain_id = domain_id; + + rv = vnet_flow_report_add_del (frm, &args, &template_id); + + switch (rv) + { + case 0: + break; + case VNET_API_ERROR_NO_SUCH_ENTRY: + return clib_error_return (0, "registration not found..."); + default: + return clib_error_return (0, "vnet_flow_report_add_del returned %d", + rv); + } + + return 0; +} + +clib_error_t * +ioam_flow_report_init (vlib_main_t * vm) +{ + clib_error_t *error; + + if ((error = vlib_call_init_function (vm, flow_report_init))) + return error; + + return 0; +} + +VLIB_INIT_FUNCTION (ioam_flow_report_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/analyse/ioam_summary_export.h b/src/plugins/ioam/analyse/ioam_summary_export.h new file mode 100755 index 00000000..b4355061 --- /dev/null +++ b/src/plugins/ioam/analyse/ioam_summary_export.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __included_ip6_ioam_flow_report_h__ +#define __included_ip6_ioam_flow_report_h__ + +#include <ioam/analyse/ioam_analyse.h> +#include <vnet/flow/flow_report.h> + +#define foreach_ioam_ipfix_info_element \ +_(ioamPacketSent, 5239, u32) \ +_(ioamPacketCount, 5237, u32) \ +_(ioamByteCount, 5238, u32) \ +_(ioamPathMap, 5262, u32) \ +_(ioamNumberOfPaths, 5264, u16) \ +_(ioamSfcValidatedCount, 5278, u32) \ +_(ioamSfcInValidatedCount, 5279, u32) \ +_(ioamSeqnoRxCount, 5280, u32) \ +_(ioamSeqnoLostCount, 5281, u32) \ +_(ioamSeqnoReorderedCount, 5282, u32) \ +_(ioamSeqnoDupCount, 5283, u32) + + +typedef enum +{ +#define _(n,v,t) n = v, + foreach_ioam_ipfix_info_element +#undef _ +} ioam_ipfix_info_element_id_t; + +#define foreach_ioam_ipfix_field \ +_(pkt_sent, 0xffffffff, ioamPacketSent, 4) \ +_(pkt_counter, 0xffffffff, ioamPacketCount, 4) \ +_(bytes_counter, 0xffffffff, ioamByteCount, 4) \ +_(pot_data.sfc_validated_count, 0xffffffff, ioamSfcValidatedCount, 4) \ +_(pot_data.sfc_invalidated_count, 0xffffffff, ioamSfcInValidatedCount, 4) \ +_(seqno_data.rx_packets, 0xffffffff, ioamSeqnoRxCount, 4) \ +_(seqno_data.lost_packets, 0xffffffff, ioamSeqnoLostCount, 4) \ +_(seqno_data.reordered_packets, 0xffffffff, ioamSeqnoReorderedCount, 4) \ +_(seqno_data.dup_packets, 0xffffffff, ioamSeqnoDupCount, 4) + +clib_error_t *ioam_flow_report_init (vlib_main_t * vm); + +typedef struct +{ + u8 num_nodes; + u8 trace_type; + u16 reserve; + u32 mean_delay; + u32 pkt_counter; + u32 bytes_counter; + ioam_path_map_t path[0]; +} ioam_path; + +clib_error_t *ioam_flow_create (u8 del); + +u8 *ioam_template_rewrite (flow_report_main_t * frm, flow_report_t * fr, + ip4_address_t * collector_address, + ip4_address_t * src_address, u16 collector_port); + +u16 ioam_analyse_add_ipfix_record (flow_report_t * fr, + ioam_analyser_data_t * record, + vlib_buffer_t * b0, u16 offset, + ip6_address_t * src, ip6_address_t * dst, + u16 src_port, u16 dst_port); + +#endif /* __included_ip6_ioam_flow_report_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/analyse/ip6/ip6_ioam_analyse.c b/src/plugins/ioam/analyse/ip6/ip6_ioam_analyse.c new file mode 100644 index 00000000..39442b62 --- /dev/null +++ b/src/plugins/ioam/analyse/ip6/ip6_ioam_analyse.c @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <ioam/analyse/ioam_analyse.h> +#include <ioam/export-common/ioam_export.h> +#include <ioam/analyse/ip6/ip6_ioam_analyse.h> +#include <ioam/analyse/ioam_summary_export.h> +#include <vnet/ip/ip.h> +#include <ioam/ipfixcollector/ipfixcollector.h> + +extern ioam_export_main_t ioam_export_main; +static clib_error_t * +ioam_analyse_enable_disable (vlib_main_t * vm, + int is_add, int is_export, int remote_listen) +{ + ipfix_client_add_del_t ipfix_reg; + clib_error_t *rv = 0; + + ipfix_reg.client_name = format (0, "ip6-hbh-analyse-remote"); + ipfix_reg.client_node = analyse_node_remote.index; + ipfix_reg.ipfix_setid = IPFIX_IOAM_EXPORT_ID; + + if (is_export) + { + rv = ioam_flow_create (!is_add); + if (rv) + goto ret; + } + + if (is_add) + { + ip6_ioam_analyse_register_handlers (); + if (remote_listen) + { + ipfix_reg.del = 0; + ipfix_collector_reg_setid (vm, &ipfix_reg); + } + else + { + ioam_export_set_next_node (&ioam_export_main, + (u8 *) "ip6-hbh-analyse-local"); + } + } + else + { + ip6_ioam_analyse_unregister_handlers (); + if (remote_listen) + { + ipfix_reg.del = 1; + ipfix_collector_reg_setid (vm, &ipfix_reg); + } + else + ioam_export_reset_next_node (&ioam_export_main); + } + +ret: + vec_free (ipfix_reg.client_name); + return rv; +} + +static clib_error_t * +set_ioam_analyse_command_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + int is_export = 0; + int is_add = 1; + int remote_listen = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "export-ipfix-collector")) + is_export = 1; + else if (unformat (input, "disable")) + is_add = 0; + else if (unformat (input, "listen-ipfix")) + remote_listen = 1; + else + break; + } + + return (ioam_analyse_enable_disable (vm, is_add, is_export, remote_listen)); +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (set_ioam_analyse_command, static) = { + .path = "set ioam analyse", + .short_help = "set ioam analyse [export-ipfix-collector] [disable] [listen-ipfix]", + .function = set_ioam_analyse_command_fn, +}; +/* *INDENT-ON* */ + +static clib_error_t * +show_ioam_analyse_cmd_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + ip6_ioam_analyser_main_t *am = &ioam_analyser_main; + ioam_analyser_data_t *record = NULL; + u8 i; + u8 *s = 0; + + vec_reset_length (s); + s = format (0, "iOAM Analyse Information: \n"); + vec_foreach_index (i, am->aggregated_data) + { + record = am->aggregated_data + i; + if (record->is_free) + continue; + + s = format (s, "Flow Number: %u\n", i); + s = print_analyse_flow (s, record); + s = format (s, "\n"); + } + vlib_cli_output (vm, "%v", s); + + vec_free (s); + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (ip6_show_ioam_ipfix_cmd, static) = { + .path = "show ioam analyse ", + .short_help = "show ioam analyser information", + .function = show_ioam_analyse_cmd_fn, +}; +/* *INDENT-ON* */ + +static clib_error_t * +ioam_analyse_init (vlib_main_t * vm) +{ + ip6_ioam_analyser_main_t *am = &ioam_analyser_main; + u16 i; + + vec_validate_aligned (am->aggregated_data, 50, CLIB_CACHE_LINE_BYTES); + vec_foreach_index (i, am->aggregated_data) + { + ioam_analyse_init_data (am->aggregated_data + i); + } + + return 0; +} + +VLIB_INIT_FUNCTION (ioam_analyse_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/analyse/ip6/ip6_ioam_analyse.h b/src/plugins/ioam/analyse/ip6/ip6_ioam_analyse.h new file mode 100644 index 
00000000..5a2a2d70 --- /dev/null +++ b/src/plugins/ioam/analyse/ip6/ip6_ioam_analyse.h @@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PLUGINS_IOAM_PLUGIN_IOAM_ANALYSE_IP6_IOAM_ANALYSE_NODE_H_
+#define PLUGINS_IOAM_PLUGIN_IOAM_ANALYSE_IP6_IOAM_ANALYSE_NODE_H_
+
+#include <ioam/analyse/ioam_analyse.h>
+#include <vnet/ip/ip6_hop_by_hop.h>
+#include <ioam/encap/ip6_ioam_trace.h>
+
+/** @brief IP6-iOAM analyser main structure.
+ @note cache aligned.
+*/
+typedef struct
+{
+ /** Array of function pointers to analyse each hop-by-hop option. */
+ int (*analyse_hbh_handler[MAX_IP6_HBH_OPTION]) (u32 flow_id,
+						  ip6_hop_by_hop_option_t *
+						  opt, u16 len);
+
+ /** This contains the aggregated data from the time VPP started analysing. */
+ ioam_analyser_data_t *aggregated_data;
+
+} ip6_ioam_analyser_main_t;
+
+extern ip6_ioam_analyser_main_t ioam_analyser_main;
+
+extern vlib_node_registration_t analyse_node_local;
+extern vlib_node_registration_t analyse_node_remote;
+
+void ip6_ioam_analyse_register_handlers (void);
+
+void ip6_ioam_analyse_unregister_handlers (void);
+
+clib_error_t *ip6_ioam_analyse_init (vlib_main_t * vm);
+
+inline static ioam_analyser_data_t *
+ioam_analyse_get_data_from_flow_id (u32 flow_id)
+{
+ if (flow_id >= vec_len (ioam_analyser_main.aggregated_data))
+ return NULL;
+
+ if (ioam_analyser_main.aggregated_data[flow_id].is_free)
+ ioam_analyser_main.aggregated_data[flow_id].is_free = 0;
+
+ return (ioam_analyser_main.aggregated_data + flow_id);
+}
+
+always_inline void *
+ip6_ioam_find_hbh_option (ip6_hop_by_hop_header_t * hbh0, u8 option)
+{
+ ip6_hop_by_hop_option_t *opt0, *limit0;
+ u8 type0;
+
+ opt0 = (ip6_hop_by_hop_option_t *) (hbh0 + 1);
+ limit0 =
+ (ip6_hop_by_hop_option_t *) ((u8 *) hbh0 + ((hbh0->length + 1) << 3));
+
+ while (opt0 < limit0)
+ {
+ type0 = opt0->type;
+ if (type0 == option)
+ return ((void *) opt0);
+
+ if (0 == type0)
+ {
+ opt0 = (ip6_hop_by_hop_option_t *) (((u8 *) opt0) + 1);
+ continue;
+ }
+ opt0 = (ip6_hop_by_hop_option_t *)
+ (((u8 *) opt0) + opt0->length + sizeof (ip6_hop_by_hop_option_t));
+ }
+
+ return NULL;
+}
+
+always_inline int
+ip6_ioam_analyse_compare_path_delay (ip6_hop_by_hop_header_t * hbh0,
+				     ip6_hop_by_hop_header_t * hbh1,
+				     bool oneway)
+{
+ ioam_trace_option_t *trace0 = NULL, *trace1 = NULL;
+ f64 delay0, delay1;
+
+ trace0 =
+ ip6_ioam_find_hbh_option (hbh0, HBH_OPTION_TYPE_IOAM_TRACE_DATA_LIST);
+ trace1 =
+ ip6_ioam_find_hbh_option (hbh1, HBH_OPTION_TYPE_IOAM_TRACE_DATA_LIST);
+
+ if (PREDICT_FALSE ((trace0 == NULL) && (trace1 == NULL)))
+ return 0;
+
+ if (PREDICT_FALSE (trace1 == NULL))
+ return 1;
+
+ if (PREDICT_FALSE (trace0 == NULL))
+ return -1;
+
+ delay0 = ip6_ioam_analyse_calc_delay (&trace0->trace_hdr,
+					trace0->hdr.length - 2, oneway);
+ delay1 = ip6_ioam_analyse_calc_delay (&trace1->trace_hdr,
+					trace1->hdr.length - 2, oneway);
+
+ return (delay0 - delay1);
+}
+
+#endif /* 
PLUGINS_IOAM_PLUGIN_IOAM_ANALYSE_IP6_IOAM_ANALYSE_NODE_H_ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */ diff --git a/src/plugins/ioam/analyse/ip6/node.c b/src/plugins/ioam/analyse/ip6/node.c new file mode 100644 index 00000000..6db6355e --- /dev/null +++ b/src/plugins/ioam/analyse/ip6/node.c @@ -0,0 +1,523 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vppinfra/error.h>
+#include <vnet/ip/ip.h>
+#include <ioam/export-common/ioam_export.h>
+#include <ioam/encap/ip6_ioam_trace.h>
+#include <ioam/encap/ip6_ioam_pot.h>
+#include <ioam/lib-pot/pot_util.h>
+#include <ioam/encap/ip6_ioam_e2e.h>
+#include <ioam/analyse/ioam_analyse.h>
+#include <ioam/analyse/ip6/ip6_ioam_analyse.h>
+#include <vnet/plugin/plugin.h>
+
+typedef struct
+{
+ u32 next_index;
+ u32 flow_id;
+} analyse_trace_t;
+
+vlib_node_registration_t analyse_node_local;
+vlib_node_registration_t analyse_node_remote;
+
+#define foreach_analyse_error \
+_(ANALYSED, "Packets analysed for summarization") \
+_(FAILED, "Packet analysis failed") \
+
+typedef enum
+{
+#define _(sym,str) ANALYSE_ERROR_##sym,
+ foreach_analyse_error
+#undef _
+ ANALYSE_N_ERROR,
+} analyse_error_t;
+
+static char *analyse_error_strings[] = {
+#define _(sym,string) string,
+ foreach_analyse_error
+#undef _
+};
+
+typedef enum
+{
+ ANALYSE_NEXT_IP4_LOOKUP,
+ ANALYSE_NEXT_IP4_DROP,
+ ANALYSE_N_NEXT,
+} analyse_next_t;
+
+ip6_ioam_analyser_main_t ioam_analyser_main;
+
+/* packet trace format function */
+static u8 *
+format_analyse_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ analyse_trace_t *t = va_arg (*args, analyse_trace_t *);
+
+ s = format (s, "IP6-ioam-analyse: flow_id %d, next index %d",
+	     t->flow_id, t->next_index);
+ return s;
+}
+
+always_inline u8
+ioam_analyse_hbh (u32 flow_id,
+		  ip6_hop_by_hop_header_t * hbh0,
+		  ip6_hop_by_hop_option_t * opt0,
+		  ip6_hop_by_hop_option_t * limit0, u16 len)
+{
+ ip6_ioam_analyser_main_t *am = &ioam_analyser_main;
+ u8 type0;
+ u8 error0 = 0;
+
+ while (opt0 < limit0)
+ {
+ type0 = opt0->type;
+ switch (type0)
+ {
+ case 0: /* Pad1 */
+ opt0 = (ip6_hop_by_hop_option_t *) (((u8 *) opt0) + 1);
+ continue;
+ case 1: /* PadN */
+ break;
+ default:
+ if (am->analyse_hbh_handler[type0])
+ {
+ if (PREDICT_FALSE
+ ((*am->analyse_hbh_handler[type0]) (flow_id, opt0,
+ len) < 0))
+ {
+ error0 = ANALYSE_ERROR_FAILED;
+ return (error0);
+ }
+ }
+ }
+ opt0 =
+ (ip6_hop_by_hop_option_t *) (((u8 *) opt0) + opt0->length +
+ sizeof (ip6_hop_by_hop_option_t));
+ }
+ return (error0);
+}
+
+/**
+ * @brief IPv6 in-band OAM (iOAM) analyse node.
+ * @node ip6-hbh-analyse-local, ip6-hbh-analyse-remote + * + * This function receives IP-FIX packets containing IPv6-iOAM records, analyses + * them and collects/aggregates the statistics. + * + * @param vm vlib_main_t corresponding to the current thread. + * @param node vlib_node_runtime_t data for this node. + * @param frame vlib_frame_t whose contents should be dispatched. + * + * @par Graph mechanics: buffer, next index usage + * + * <em>Uses:</em> + * - <code>vlib_buffer_get_current(p0)</code> + * - Walks on each ioam record present in IP-Fix record, analyse them and + * store the statistics. + * + * <em>Next Index:</em> + * - Dispatches the packet to ip4-lookup if executed under ip6-hbh-analyse-local + * node context and to ip4-drop if executed under ip6-hbh-analyse-remote node + * context. + */ +static uword +ip6_ioam_analyse_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 n_left_from, *from, *to_next; + analyse_next_t next_index; + u32 pkts_analysed = 0; + u32 pkts_failed = 0; + u8 remote = 0; + u32 next0 = ANALYSE_NEXT_IP4_LOOKUP; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + if (PREDICT_FALSE (analyse_node_remote.index == node->node_index)) + { + remote = 1; + next0 = ANALYSE_NEXT_IP4_DROP; + } + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *p0; + ip4_header_t *ip40; + u8 *data, *limit; + u16 num_ioam_records; + + /* speculatively enqueue p0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, bi0); + if (PREDICT_FALSE (remote)) + { + vlib_buffer_advance (p0, -(word) (sizeof (udp_header_t) + + sizeof (ip4_header_t) + + sizeof + (ipfix_message_header_t) + + sizeof (ipfix_set_header_t))); + } + data = (u8 *) vlib_buffer_get_current (p0); + ip40 = (ip4_header_t *) vlib_buffer_get_current (p0); + limit = data + clib_net_to_host_u16 (ip40->length); + data += sizeof (ip4_header_t) + sizeof (udp_header_t) + + sizeof (ipfix_message_header_t) + sizeof (ipfix_set_header_t); + + num_ioam_records = (limit - data) / DEFAULT_EXPORT_SIZE; + + while (num_ioam_records >= 4) + { + /* Prefetch next 2 ioam records */ + { + CLIB_PREFETCH (data + (2 * DEFAULT_EXPORT_SIZE), + (DEFAULT_EXPORT_SIZE), LOAD); + CLIB_PREFETCH (data + (3 * DEFAULT_EXPORT_SIZE), + (DEFAULT_EXPORT_SIZE), LOAD); + } + + num_ioam_records -= 2; + + ip6_header_t *ip60, *ip61; + ip6_hop_by_hop_header_t *hbh0, *hbh1; + ip6_hop_by_hop_option_t *opt0, *limit0, *opt1, *limit1; + u32 flow_id0, flow_id1; + u8 error0, error1; + ioam_analyser_data_t *data0, *data1; + u16 p_len0, p_len1; + + ip60 = (ip6_header_t *) data; + ip61 = (ip6_header_t *) (data + DEFAULT_EXPORT_SIZE); + + data += (2 * DEFAULT_EXPORT_SIZE); + + hbh0 = (ip6_hop_by_hop_header_t *) (ip60 + 1); + hbh1 = (ip6_hop_by_hop_header_t *) (ip61 + 1); + + opt0 = (ip6_hop_by_hop_option_t *) (hbh0 + 1); + opt1 = (ip6_hop_by_hop_option_t *) (hbh1 + 1); + + limit0 = + (ip6_hop_by_hop_option_t *) ((u8 *) hbh0 + + ((hbh0->length + 1) << 3)); + limit1 = + (ip6_hop_by_hop_option_t *) ((u8 *) hbh1 + + ((hbh1->length + 1) << 3)); + + flow_id0 = + clib_net_to_host_u32 + (ip60->ip_version_traffic_class_and_flow_label) & 0xFFFFF; + flow_id1 = + clib_net_to_host_u32 + 
(ip61->ip_version_traffic_class_and_flow_label) & 0xFFFFF;
+
+ p_len0 = clib_net_to_host_u16 (ip60->payload_length);
+ p_len1 = clib_net_to_host_u16 (ip61->payload_length);
+
+ error0 =
+ ioam_analyse_hbh (flow_id0, hbh0, opt0, limit0, p_len0);
+ error1 =
+ ioam_analyse_hbh (flow_id1, hbh1, opt1, limit1, p_len1);
+
+ if (PREDICT_TRUE ((error0 == 0) && (error1 == 0)))
+ {
+ pkts_analysed += 2;
+ data0 = ioam_analyse_get_data_from_flow_id (flow_id0);
+ data1 = ioam_analyse_get_data_from_flow_id (flow_id1);
+
+ while (__sync_lock_test_and_set (data0->writer_lock, 1))
+ ;
+ data0->pkt_counter++;
+ data0->bytes_counter += p_len0;
+ *(data0->writer_lock) = 0;
+
+ while (__sync_lock_test_and_set (data1->writer_lock, 1))
+ ;
+ data1->pkt_counter++;
+ data1->bytes_counter += p_len1;
+ *(data1->writer_lock) = 0;
+ }
+ else if (error0 == 0)
+ {
+ pkts_analysed++;
+ pkts_failed++;
+
+ data0 = ioam_analyse_get_data_from_flow_id (flow_id0);
+ while (__sync_lock_test_and_set (data0->writer_lock, 1))
+ ;
+ data0->pkt_counter++;
+ data0->bytes_counter += p_len0;
+ *(data0->writer_lock) = 0;
+ }
+ else if (error1 == 0)
+ {
+ pkts_analysed++;
+ pkts_failed++;
+
+ data1 = ioam_analyse_get_data_from_flow_id (flow_id1);
+ while (__sync_lock_test_and_set (data1->writer_lock, 1))
+ ;
+ data1->pkt_counter++;
+ data1->bytes_counter += p_len1;
+ *(data1->writer_lock) = 0;
+ }
+ else
+ pkts_failed += 2;
+ }
+
+ while (num_ioam_records > 0)
+ {
+ num_ioam_records--;
+
+ ip6_header_t *ip60;
+ ip6_hop_by_hop_header_t *hbh0;
+ ip6_hop_by_hop_option_t *opt0, *limit0;
+ u32 flow_id0;
+ u8 error0;
+ ioam_analyser_data_t *data0;
+ u16 p_len0;
+
+ ip60 = (ip6_header_t *) data;
+ data += (1 * DEFAULT_EXPORT_SIZE);
+ hbh0 = (ip6_hop_by_hop_header_t *) (ip60 + 1);
+ opt0 = (ip6_hop_by_hop_option_t *) (hbh0 + 1);
+ limit0 =
+ (ip6_hop_by_hop_option_t *) ((u8 *) hbh0 +
+ ((hbh0->length + 1) << 3));
+
+ flow_id0 =
+ clib_net_to_host_u32
+ (ip60->ip_version_traffic_class_and_flow_label) & 0xFFFFF;
+ p_len0 = clib_net_to_host_u16 (ip60->payload_length);
+ error0 =
+ ioam_analyse_hbh (flow_id0, hbh0, opt0, limit0, p_len0);
+
+ if (PREDICT_TRUE (error0 == 0))
+ {
+ pkts_analysed++;
+ data0 = ioam_analyse_get_data_from_flow_id (flow_id0);
+ while (__sync_lock_test_and_set (data0->writer_lock, 1))
+ ;
+ data0->pkt_counter++;
+ data0->bytes_counter +=
+ clib_net_to_host_u16 (ip60->payload_length);
+ *(data0->writer_lock) = 0;
+ }
+ else
+ pkts_failed++;
+ }
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+ n_left_to_next, bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ vlib_node_increment_counter (vm, node->node_index, ANALYSE_ERROR_ANALYSED,
+ pkts_analysed);
+
+ if (PREDICT_FALSE (pkts_failed))
+ vlib_node_increment_counter (vm, node->node_index, ANALYSE_ERROR_FAILED,
+ pkts_failed);
+
+ return frame->n_vectors;
+}
+
+int
+ip6_ioam_analyse_hbh_trace_internal (u32 flow_id,
+				     ip6_hop_by_hop_option_t * opt, u16 len)
+{
+ ioam_analyser_data_t *data;
+ ioam_trace_option_t *trace = (ioam_trace_option_t *) opt;
+
+ data = ioam_analyse_get_data_from_flow_id (flow_id);
+ ASSERT (data != NULL);
+
+ (void) ip6_ioam_analyse_hbh_trace (data, &trace->trace_hdr, len,
+ (trace->hdr.length - 2)
+ /*ioam_trace_type,data_list_elts_left */
+ );
+ return 0;
+}
+
+int
+ip6_ioam_analyse_hbh_pot (u32 flow_id, ip6_hop_by_hop_option_t * opt0,
+			  u16 len)
+{
+
+ ioam_pot_option_t *pot0;
+ u64 random = 0;
+ u64 cumulative = 0;
+ 
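+ /* Proof of Transit: validate the packet's (random, cumulative) pair
+ against the active PoT profile. */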
pot_profile *pot_profile = 0; + int ret; + ioam_analyser_data_t *data; + + data = ioam_analyse_get_data_from_flow_id (flow_id); + + pot0 = (ioam_pot_option_t *) opt0; + random = clib_net_to_host_u64 (pot0->random); + cumulative = clib_net_to_host_u64 (pot0->cumulative); + pot_profile = pot_profile_get_active (); + ret = pot_validate (pot_profile, cumulative, random); + + while (__sync_lock_test_and_set (data->writer_lock, 1)) + ; + + (0 == ret) ? (data->pot_data.sfc_validated_count++) : + (data->pot_data.sfc_invalidated_count++); + + *(data->writer_lock) = 0; + return 0; +} + +int +ip6_ioam_analyse_hbh_e2e_internal (u32 flow_id, ip6_hop_by_hop_option_t * opt, + u16 len) +{ + ioam_analyser_data_t *data; + ioam_e2e_option_t *e2e; + + data = ioam_analyse_get_data_from_flow_id (flow_id); + e2e = (ioam_e2e_option_t *) opt; + ip6_ioam_analyse_hbh_e2e (data, &e2e->e2e_hdr, len); + return 0; +} + +int +ip6_ioam_analyse_register_hbh_handler (u8 option, + int options (u32 flow_id, + ip6_hop_by_hop_option_t * + opt, u16 len)) +{ + ip6_ioam_analyser_main_t *am = &ioam_analyser_main; + + ASSERT (option < ARRAY_LEN (am->analyse_hbh_handler)); + + /* Already registered */ + if (am->analyse_hbh_handler[option]) + return (-1); + + am->analyse_hbh_handler[option] = options; + + return (0); +} + +int +ip6_ioam_analyse_unregister_hbh_handler (u8 option) +{ + ip6_ioam_analyser_main_t *am = &ioam_analyser_main; + + ASSERT (option < ARRAY_LEN (am->analyse_hbh_handler)); + + /* Not registered */ + if (!am->analyse_hbh_handler[option]) + return (-1); + + am->analyse_hbh_handler[option] = NULL; + return (0); +} + +void +ip6_ioam_analyse_register_handlers () +{ + ip6_ioam_analyse_register_hbh_handler (HBH_OPTION_TYPE_IOAM_TRACE_DATA_LIST, + ip6_ioam_analyse_hbh_trace_internal); + ip6_ioam_analyse_register_hbh_handler + (HBH_OPTION_TYPE_IOAM_PROOF_OF_TRANSIT, ip6_ioam_analyse_hbh_pot); + ip6_ioam_analyse_register_hbh_handler (HBH_OPTION_TYPE_IOAM_EDGE_TO_EDGE, + ip6_ioam_analyse_hbh_e2e_internal); +} + +void +ip6_ioam_analyse_unregister_handlers () +{ + ip6_ioam_analyse_unregister_hbh_handler + (HBH_OPTION_TYPE_IOAM_TRACE_DATA_LIST); + ip6_ioam_analyse_unregister_hbh_handler + (HBH_OPTION_TYPE_IOAM_PROOF_OF_TRANSIT); + ip6_ioam_analyse_unregister_hbh_handler (HBH_OPTION_TYPE_IOAM_EDGE_TO_EDGE); +} + +/* *INDENT-OFF* */ + +/* + * Node for IP6 analyse - packets + */ +VLIB_REGISTER_NODE (analyse_node_local) = { + .function = ip6_ioam_analyse_node_fn, + .name = "ip6-hbh-analyse-local", + .vector_size = sizeof (u32), + .format_trace = format_analyse_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN (analyse_error_strings), + .error_strings = analyse_error_strings, + .n_next_nodes = ANALYSE_N_NEXT, + /* edit / add dispositions here */ + .next_nodes = { + [ANALYSE_NEXT_IP4_LOOKUP] = "ip4-lookup", + [ANALYSE_NEXT_IP4_DROP] = "ip4-drop", + }, +}; + +/* + * Node for IP6 analyse - packets + */ +VLIB_REGISTER_NODE (analyse_node_remote) = +{ + .function = ip6_ioam_analyse_node_fn, + .name = "ip6-hbh-analyse-remote", + .vector_size = sizeof (u32), + .format_trace = format_analyse_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN (analyse_error_strings), + .error_strings = analyse_error_strings, + .n_next_nodes = ANALYSE_N_NEXT, + /* edit / add dispositions here */ + .next_nodes = { + [ANALYSE_NEXT_IP4_LOOKUP] = "ip4-lookup", + [ANALYSE_NEXT_IP4_DROP] = "ip4-drop", + }, +}; + +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") 
+ * End:
+ */ diff --git a/src/plugins/ioam/dir.dox b/src/plugins/ioam/dir.dox new file mode 100644 index 00000000..f3389b52 --- /dev/null +++ b/src/plugins/ioam/dir.dox @@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ @dir
+ @brief Inband OAM (iOAM) implementation
+*/ diff --git a/src/plugins/ioam/encap/ip6_ioam_e2e.c b/src/plugins/ioam/encap/ip6_ioam_e2e.c new file mode 100644 index 00000000..cdaf740d --- /dev/null +++ b/src/plugins/ioam/encap/ip6_ioam_e2e.c @@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vppinfra/error.h>
+
+#include <vnet/ip/ip.h>
+
+#include <vppinfra/hash.h>
+#include <vppinfra/error.h>
+#include <vppinfra/elog.h>
+
+#include <vnet/ip/ip6_hop_by_hop.h>
+#include "ip6_ioam_e2e.h"
+
+ioam_e2e_main_t ioam_e2e_main;
+
+static u8 * ioam_e2e_trace_handler (u8 * s,
+                                    ip6_hop_by_hop_option_t *opt)
+{
+ ioam_e2e_option_t * e2e = (ioam_e2e_option_t *)opt;
+ u32 seqno = 0;
+
+ if (e2e)
+ {
+ seqno = clib_net_to_host_u32 (e2e->e2e_hdr.e2e_data);
+ }
+
+ s = format (s, "SeqNo = 0x%x", seqno);
+ return s;
+}
+
+int
+ioam_e2e_config_handler (void *data, u8 disable)
+{
+ int *analyse = data;
+
+ /* Register handlers if enabled */
+ if (!disable)
+ {
+ /* If encap node, register the encap handler */
+ if (0 == *analyse)
+ {
+ if (ip6_hbh_register_option(HBH_OPTION_TYPE_IOAM_EDGE_TO_EDGE,
+ ioam_seqno_encap_handler,
+ ioam_e2e_trace_handler) < 0)
+ {
+ return (-1);
+ }
+ }
+ /* If analyse node, register the decap handler */
+ else
+ {
+ if (ip6_hbh_pop_register_option(HBH_OPTION_TYPE_IOAM_EDGE_TO_EDGE,
+ ioam_seqno_decap_handler) < 0)
+ {
+ return (-1);
+ }
+ }
+ return 0;
+ }
+
+ /* Unregister handlers */
+ (void) ip6_hbh_unregister_option(HBH_OPTION_TYPE_IOAM_EDGE_TO_EDGE);
+ (void) ip6_hbh_pop_unregister_option(HBH_OPTION_TYPE_IOAM_EDGE_TO_EDGE);
+ return 0;
+}
+
+int
+ioam_e2e_rewrite_handler (u8 *rewrite_string,
+                          u8 *rewrite_size)
+{
+ ioam_e2e_option_t *e2e_option;
+
+ if (rewrite_string && *rewrite_size == sizeof(ioam_e2e_option_t))
+ {
+ e2e_option = (ioam_e2e_option_t *)rewrite_string;
+ e2e_option->hdr.type = HBH_OPTION_TYPE_IOAM_EDGE_TO_EDGE
+ | HBH_OPTION_TYPE_SKIP_UNKNOWN;
+ e2e_option->hdr.length = sizeof (ioam_e2e_option_t) -
+ sizeof (ip6_hop_by_hop_option_t);
+ return(0);
+ }
+ return(-1);
+}
+
+u32
+ioam_e2e_flow_handler (u32 ctx, u8 add)
+{
+ ioam_e2e_data_t *data;
+ u16 i;
+
+ if (add)
+ {
+ pool_get(ioam_e2e_main.e2e_data, data);
+ data->flow_ctx = ctx;
+ ioam_seqno_init_data(&data->seqno_data);
+ return ((u32) (data - ioam_e2e_main.e2e_data));
+ }
+
+ /* Delete case */
+ for (i = 0; i < vec_len(ioam_e2e_main.e2e_data); i++)
+ {
+ if (pool_is_free_index(ioam_e2e_main.e2e_data, i))
+ continue;
+
+ data = pool_elt_at_index(ioam_e2e_main.e2e_data, i);
+ if (data && (data->flow_ctx == ctx))
+ {
+ pool_put_index(ioam_e2e_main.e2e_data, i);
+ return (0);
+ }
+ }
+ return 0;
+}
+
+static clib_error_t *
+ioam_show_e2e_cmd_fn (vlib_main_t * vm,
+                      unformat_input_t * input,
+                      vlib_cli_command_t * cmd)
+{
+ ioam_e2e_data_t *e2e_data;
+ u8 *s = 0;
+ int i;
+
+ vec_reset_length(s);
+
+ s = format(0, "IOAM E2E information: \n");
+ for (i = 0; i < vec_len(ioam_e2e_main.e2e_data); i++)
+ {
+ if (pool_is_free_index(ioam_e2e_main.e2e_data, i))
+ continue;
+
+ e2e_data = pool_elt_at_index(ioam_e2e_main.e2e_data, i);
+ s = format(s, "Flow name: %s\n", get_flow_name_from_flow_ctx(e2e_data->flow_ctx));
+
+ s = show_ioam_seqno_cmd_fn(s,
+                            &e2e_data->seqno_data,
+                            !IOAM_DEAP_ENABLED(e2e_data->flow_ctx));
+ }
+
+ vlib_cli_output(vm, "%v", s);
+ return 0;
+}
+
+
+VLIB_CLI_COMMAND (ioam_show_e2e_cmd, static) = {
+ .path = "show ioam e2e",
+ .short_help = "show ioam e2e information",
+ .function = ioam_show_e2e_cmd_fn,
+};
+
+/*
+ * Init handler for E2E header handling.
+ * The init handler registers the encap, decap, trace and rewrite handlers.
+ */
+static clib_error_t *
+ioam_e2e_init (vlib_main_t * vm)
+{
+ clib_error_t * error;
+
+ if ((error = vlib_call_init_function (vm, ip6_hop_by_hop_ioam_init)))
+ {
+ return(error);
+ }
+
+ /*
+ * As of now we have only PPC under E2E header.
+ */
+ if (ip6_hbh_config_handler_register(HBH_OPTION_TYPE_IOAM_EDGE_TO_EDGE,
+ ioam_e2e_config_handler) < 0)
+ {
+ return (clib_error_create("Registration of "
+ "HBH_OPTION_TYPE_IOAM_EDGE_TO_EDGE config handler failed"));
+ }
+
+ if (ip6_hbh_add_register_option(HBH_OPTION_TYPE_IOAM_EDGE_TO_EDGE,
+ sizeof(ioam_e2e_option_t),
+ ioam_e2e_rewrite_handler) < 0)
+ {
+ return (clib_error_create("Registration of "
+ "HBH_OPTION_TYPE_IOAM_EDGE_TO_EDGE for rewrite failed"));
+ }
+
+ if (ip6_hbh_flow_handler_register(HBH_OPTION_TYPE_IOAM_EDGE_TO_EDGE,
+ ioam_e2e_flow_handler) < 0)
+ {
+ return (clib_error_create("Registration of "
+ "HBH_OPTION_TYPE_IOAM_EDGE_TO_EDGE Flow handler failed"));
+ }
+
+ ioam_e2e_main.vlib_main = vm;
+ ioam_e2e_main.vnet_main = vnet_get_main();
+
+ return (0);
+}
+
+/*
+ * Init function for the E2E lib.
+ * ioam_e2e_init gets called during init.
+ */
+VLIB_INIT_FUNCTION (ioam_e2e_init); diff --git a/src/plugins/ioam/encap/ip6_ioam_e2e.h b/src/plugins/ioam/encap/ip6_ioam_e2e.h new file mode 100644 index 00000000..fb83403d --- /dev/null +++ b/src/plugins/ioam/encap/ip6_ioam_e2e.h @@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef __included_ip6_ioam_e2e_h__ +#define __included_ip6_ioam_e2e_h__ + +#include <ioam/lib-e2e/e2e_util.h> +#include "ip6_ioam_seqno.h" + +/* *INDENT-OFF* */ +typedef CLIB_PACKED(struct { + ip6_hop_by_hop_option_t hdr; + ioam_e2e_packet_t e2e_hdr; +}) ioam_e2e_option_t; +/* *INDENT-ON* */ + +typedef struct ioam_e2e_data_t_ { + u32 flow_ctx; + u32 pad; + ioam_seqno_data seqno_data; +} ioam_e2e_data_t; + +typedef struct { + ioam_e2e_data_t *e2e_data; + vlib_main_t *vlib_main; + vnet_main_t *vnet_main; +} ioam_e2e_main_t; + +extern ioam_e2e_main_t ioam_e2e_main; + +static inline ioam_seqno_data * +ioam_e2ec_get_seqno_data_from_flow_ctx (u32 flow_ctx) +{ + ioam_e2e_data_t *data = NULL; + u32 index; + + index = get_flow_data_from_flow_ctx(flow_ctx, + HBH_OPTION_TYPE_IOAM_EDGE_TO_EDGE); + data = &ioam_e2e_main.e2e_data[index]; + return &(data->seqno_data); +} + +static inline u32 +ioam_e2e_get_cur_seqno_from_flow_ctx (u32 flow_ctx) +{ + ioam_seqno_data *data = NULL; + + data = ioam_e2ec_get_seqno_data_from_flow_ctx(flow_ctx); + return data->seq_num; +} + +#endif /* __included_ip6_ioam_e2e_h__ */ diff --git a/src/plugins/ioam/encap/ip6_ioam_pot.c b/src/plugins/ioam/encap/ip6_ioam_pot.c new file mode 100644 index 00000000..9a761233 --- /dev/null +++ b/src/plugins/ioam/encap/ip6_ioam_pot.c @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vppinfra/error.h> +#include <vnet/ip/ip6.h> + +#include <vppinfra/hash.h> +#include <vppinfra/error.h> +#include <vppinfra/elog.h> + +#include <ioam/encap/ip6_ioam_pot.h> +#include <ioam/lib-pot/pot_util.h> + +#define foreach_ip6_hop_by_hop_ioam_pot_stats \ + _(PROCESSED, "Pkts with ip6 hop-by-hop pot options") \ + _(PROFILE_MISS, "Pkts with ip6 hop-by-hop pot options but no profile set") \ + _(PASSED, "Pkts with POT in Policy") \ + _(FAILED, "Pkts with POT out of Policy") + +static char * ip6_hop_by_hop_ioam_pot_stats_strings[] = { +#define _(sym,string) string, + foreach_ip6_hop_by_hop_ioam_pot_stats +#undef _ +}; + +typedef enum { +#define _(sym,str) IP6_IOAM_POT_##sym, + foreach_ip6_hop_by_hop_ioam_pot_stats +#undef _ + IP6_IOAM_POT_N_STATS, +} ip6_ioam_pot_stats_t; + +typedef struct { + /* stats */ + u64 counters[ARRAY_LEN(ip6_hop_by_hop_ioam_pot_stats_strings)]; + + /* convenience */ + vlib_main_t * vlib_main; + vnet_main_t * vnet_main; +} ip6_hop_by_hop_ioam_pot_main_t; + +ip6_hop_by_hop_ioam_pot_main_t ip6_hop_by_hop_ioam_pot_main; + +always_inline void +ip6_ioam_stats_increment_counter (u32 counter_index, u64 increment) +{ + ip6_hop_by_hop_ioam_pot_main_t *hm = &ip6_hop_by_hop_ioam_pot_main; + + hm->counters[counter_index] += increment; +} + + +static u8 * format_ioam_pot (u8 * s, va_list * args) +{ + ioam_pot_option_t * pot0 = va_arg (*args, ioam_pot_option_t *); + u64 random, cumulative; + random = cumulative = 0; + if (pot0) + { + random = clib_net_to_host_u64 (pot0->random); + cumulative = clib_net_to_host_u64 (pot0->cumulative); + } + + s = format (s, "random = 0x%Lx, Cumulative = 0x%Lx, Index = 0x%x", + random, cumulative, pot0 ? pot0->reserved_profile_id : ~0); + return s; +} + +u8 * +ip6_hbh_ioam_proof_of_transit_trace_handler (u8 *s, ip6_hop_by_hop_option_t *opt) +{ + ioam_pot_option_t *pot; + + s = format (s, " POT opt present\n"); + pot = (ioam_pot_option_t *) opt; + s = format (s, " %U\n", format_ioam_pot, pot); + return (s); +} + +int +ip6_hbh_ioam_proof_of_transit_handler (vlib_buffer_t *b, + ip6_header_t *ip, + ip6_hop_by_hop_option_t *opt0) +{ + ioam_pot_option_t * pot0; + u64 random = 0, cumulative = 0; + int rv = 0; + u8 pot_profile_index; + pot_profile *pot_profile = 0, *new_profile = 0; + u8 pot_encap = 0; + + pot0 = (ioam_pot_option_t *) opt0; + pot_encap = (pot0->random == 0); + pot_profile_index = pot_profile_get_active_id(); + pot_profile = pot_profile_get_active(); + if (pot_encap && PREDICT_FALSE(!pot_profile)) + { + ip6_ioam_stats_increment_counter (IP6_IOAM_POT_PROFILE_MISS, 1); + return(-1); + } + if (pot_encap) + { + pot0->reserved_profile_id = + pot_profile_index & PROFILE_ID_MASK; + pot_profile_incr_usage_stats(pot_profile); + } + else + { /* Non encap node */ + if (PREDICT_FALSE(pot0->reserved_profile_id != + pot_profile_index || pot_profile == 0)) + { + /* New profile announced by encap node. 
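Validate it, switch the active profile to it and reset its usage stats before verifying this packet against it. 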
*/ + new_profile = + pot_profile_find(pot0->reserved_profile_id); + if (PREDICT_FALSE(new_profile == 0 || + new_profile->valid == 0)) + { + ip6_ioam_stats_increment_counter (IP6_IOAM_POT_PROFILE_MISS, 1); + return(-1); + } + else + { + pot_profile_index = pot0->reserved_profile_id; + pot_profile = new_profile; + pot_profile_set_active(pot_profile_index); + pot_profile_reset_usage_stats(pot_profile); + } + } + pot_profile_incr_usage_stats(pot_profile); + } + + if (pot0->random == 0) + { + pot0->random = clib_host_to_net_u64(pot_generate_random(pot_profile)); + pot0->cumulative = 0; + } + random = clib_net_to_host_u64(pot0->random); + cumulative = clib_net_to_host_u64(pot0->cumulative); + pot0->cumulative = clib_host_to_net_u64( + pot_update_cumulative(pot_profile, + cumulative, + random)); + ip6_ioam_stats_increment_counter (IP6_IOAM_POT_PROCESSED, 1); + + return (rv); +} + +int +ip6_hbh_ioam_proof_of_transit_pop_handler (vlib_buffer_t *b, ip6_header_t *ip, + ip6_hop_by_hop_option_t *opt0) +{ + ioam_pot_option_t * pot0; + u64 random = 0; + u64 cumulative = 0; + int rv = 0; + pot_profile *pot_profile = 0; + u8 result = 0; + + pot0 = (ioam_pot_option_t *) opt0; + random = clib_net_to_host_u64(pot0->random); + cumulative = clib_net_to_host_u64(pot0->cumulative); + pot_profile = pot_profile_get_active(); + result = pot_validate (pot_profile, + cumulative, random); + + if (result == 1) + { + ip6_ioam_stats_increment_counter (IP6_IOAM_POT_PASSED, 1); + } + else + { + ip6_ioam_stats_increment_counter (IP6_IOAM_POT_FAILED, 1); + } + return (rv); +} + +int ip6_hop_by_hop_ioam_pot_rewrite_handler (u8 *rewrite_string, u8 *rewrite_size) +{ + ioam_pot_option_t * pot_option; + if (rewrite_string && *rewrite_size == sizeof(ioam_pot_option_t)) + { + pot_option = (ioam_pot_option_t *)rewrite_string; + pot_option->hdr.type = HBH_OPTION_TYPE_IOAM_PROOF_OF_TRANSIT + | HBH_OPTION_TYPE_DATA_CHANGE_ENROUTE; + pot_option->hdr.length = sizeof (ioam_pot_option_t) - + sizeof (ip6_hop_by_hop_option_t); + return(0); + } + return(-1); +} + +static clib_error_t * +ip6_show_ioam_pot_cmd_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + ip6_hop_by_hop_ioam_pot_main_t *hm = &ip6_hop_by_hop_ioam_pot_main; + u8 *s = 0; + int i = 0; + + for ( i = 0; i < IP6_IOAM_POT_N_STATS; i++) + { + s = format(s, " %s - %lu\n", ip6_hop_by_hop_ioam_pot_stats_strings[i], + hm->counters[i]); + } + + vlib_cli_output(vm, "%v", s); + vec_free(s); + return 0; +} + + +VLIB_CLI_COMMAND (ip6_show_ioam_pot_cmd, static) = { + .path = "show ioam pot", + .short_help = "iOAM pot statistics", + .function = ip6_show_ioam_pot_cmd_fn, +}; + + +static clib_error_t * +ip6_hop_by_hop_ioam_pot_init (vlib_main_t * vm) +{ + ip6_hop_by_hop_ioam_pot_main_t * hm = &ip6_hop_by_hop_ioam_pot_main; + clib_error_t * error; + + if ((error = vlib_call_init_function (vm, ip6_hop_by_hop_ioam_init))) + return(error); + + hm->vlib_main = vm; + hm->vnet_main = vnet_get_main(); + memset(hm->counters, 0, sizeof(hm->counters)); + + if (ip6_hbh_register_option(HBH_OPTION_TYPE_IOAM_PROOF_OF_TRANSIT, ip6_hbh_ioam_proof_of_transit_handler, + ip6_hbh_ioam_proof_of_transit_trace_handler) < 0) + return (clib_error_create("registration of HBH_OPTION_TYPE_IOAM_PROOF_OF_TRANSIT failed")); + + if (ip6_hbh_add_register_option(HBH_OPTION_TYPE_IOAM_PROOF_OF_TRANSIT, + sizeof(ioam_pot_option_t), + ip6_hop_by_hop_ioam_pot_rewrite_handler) < 0) + return (clib_error_create("registration of HBH_OPTION_TYPE_IOAM_PROOF_OF_TRANSIT for rewrite failed")); + + if 
(ip6_hbh_pop_register_option(HBH_OPTION_TYPE_IOAM_PROOF_OF_TRANSIT, + ip6_hbh_ioam_proof_of_transit_pop_handler) < 0) + return (clib_error_create("registration of HBH_OPTION_TYPE_IOAM_PROOF_OF_TRANSIT POP failed")); + + return (0); +} + +VLIB_INIT_FUNCTION (ip6_hop_by_hop_ioam_pot_init); + + diff --git a/src/plugins/ioam/encap/ip6_ioam_pot.h b/src/plugins/ioam/encap/ip6_ioam_pot.h new file mode 100644 index 00000000..01ce4ac5 --- /dev/null +++ b/src/plugins/ioam/encap/ip6_ioam_pot.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef PLUGINS_IOAM_PLUGIN_IOAM_ENCAP_IP6_IOAM_POT_H_ +#define PLUGINS_IOAM_PLUGIN_IOAM_ENCAP_IP6_IOAM_POT_H_ + +#include <vnet/ip/ip6_hop_by_hop_packet.h> + +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct { + ip6_hop_by_hop_option_t hdr; + u8 pot_type; + #define PROFILE_ID_MASK 0xF + u8 reserved_profile_id; /* 4 bits reserved, 4 bits to carry profile id */ + u64 random; + u64 cumulative; +}) ioam_pot_option_t; +/* *INDENT-ON* */ + +#endif /* PLUGINS_IOAM_PLUGIN_IOAM_ENCAP_IP6_IOAM_POT_H_ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/encap/ip6_ioam_seqno.c b/src/plugins/ioam/encap/ip6_ioam_seqno.c new file mode 100644 index 00000000..08bf554b --- /dev/null +++ b/src/plugins/ioam/encap/ip6_ioam_seqno.c @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vppinfra/error.h> + +#include <vnet/ip/ip.h> + +#include <vppinfra/hash.h> +#include <vppinfra/error.h> +#include <vppinfra/elog.h> + +#include "ip6_ioam_seqno.h" +#include "ip6_ioam_e2e.h" + + +/* + * This routine gets called from IPv6 hop-by-hop option handling. + * Only on the encap node do we add PPC data. + * On a transit (MID) node we don't do anything with E2E headers. + * On the decap node, decap is handled by a separate function. 
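+ * + * In short (a sketch mirroring the code below): the encap handler stamps + * the next per-flow sequence number into the E2E option in network order, + * e2e->e2e_hdr.e2e_data = clib_host_to_net_u32 (++data->seq_num); + * and the decap handler hands the received value to ioam_analyze_seqno() + * for sequence-number analysis. 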
+ */ +int +ioam_seqno_encap_handler (vlib_buffer_t *b, ip6_header_t *ip, + ip6_hop_by_hop_option_t *opt) +{ + u32 opaque_index = vnet_buffer(b)->l2_classify.opaque_index; + ioam_e2e_option_t * e2e; + int rv = 0; + ioam_seqno_data *data; + + /* Bypass seqno processing */ + if (PREDICT_FALSE(opaque_index == 0x7FFFFFFF)) + return rv; + + data = ioam_e2ec_get_seqno_data_from_flow_ctx(opaque_index); + e2e = (ioam_e2e_option_t *) opt; + e2e->e2e_hdr.e2e_data = clib_host_to_net_u32(++data->seq_num); + + return (rv); +} + +/* + * This Routine gets called on POP/Decap node. + */ +int +ioam_seqno_decap_handler (vlib_buffer_t *b, ip6_header_t *ip, + ip6_hop_by_hop_option_t *opt) +{ + u32 opaque_index = vnet_buffer(b)->l2_classify.opaque_index; + ioam_e2e_option_t * e2e; + int rv = 0; + ioam_seqno_data *data; + + data = ioam_e2ec_get_seqno_data_from_flow_ctx(opaque_index); + e2e = (ioam_e2e_option_t *) opt; + ioam_analyze_seqno(&data->seqno_rx, + (u64) clib_net_to_host_u32(e2e->e2e_hdr.e2e_data)); + + return (rv); +} diff --git a/src/plugins/ioam/encap/ip6_ioam_seqno.h b/src/plugins/ioam/encap/ip6_ioam_seqno.h new file mode 100644 index 00000000..5c140246 --- /dev/null +++ b/src/plugins/ioam/encap/ip6_ioam_seqno.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __included_ip6_ioam_seqno_h__ +#define __included_ip6_ioam_seqno_h__ + +#include <vnet/ip/ip6_packet.h> +#include <vnet/ip/ip6_hop_by_hop.h> +#include <ioam/lib-e2e/e2e_util.h> + +int ioam_seqno_encap_handler(vlib_buffer_t *b, ip6_header_t *ip, + ip6_hop_by_hop_option_t *opt); + +int +ioam_seqno_decap_handler(vlib_buffer_t *b, ip6_header_t *ip, + ip6_hop_by_hop_option_t *opt); + +#endif diff --git a/src/plugins/ioam/encap/ip6_ioam_trace.c b/src/plugins/ioam/encap/ip6_ioam_trace.c new file mode 100644 index 00000000..3ec3ea82 --- /dev/null +++ b/src/plugins/ioam/encap/ip6_ioam_trace.c @@ -0,0 +1,502 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vppinfra/error.h> +#include <vpp/app/version.h> + +#include <vnet/ip/ip6.h> +#include <vnet/ip/ip6_hop_by_hop.h> +#include <vnet/ip/ip6_hop_by_hop_packet.h> + +#include <vppinfra/hash.h> +#include <vppinfra/error.h> +#include <vppinfra/elog.h> +#include <vnet/plugin/plugin.h> + +#include <ioam/lib-trace/trace_util.h> +#include <ioam/lib-trace/trace_config.h> +#include <ioam/encap/ip6_ioam_trace.h> +#include <ioam/udp-ping/udp_ping.h> +#include <ioam/udp-ping/udp_ping_packet.h> +#include <ioam/udp-ping/udp_ping_util.h> + +/* Timestamp precision multipliers for seconds, milliseconds, microseconds + * and nanoseconds respectively. + */ +static f64 trace_tsp_mul[4] = { 1, 1e3, 1e6, 1e9 }; + +typedef union +{ + u64 as_u64; + u32 as_u32[2]; +} time_u64_t; + +extern ip6_hop_by_hop_ioam_main_t ip6_hop_by_hop_ioam_main; +extern ip6_main_t ip6_main; + +#define foreach_ip6_hop_by_hop_ioam_trace_stats \ + _(PROCESSED, "Pkts with ip6 hop-by-hop trace options") \ + _(PROFILE_MISS, "Pkts with ip6 hop-by-hop trace options but no profile set") \ + _(UPDATED, "Pkts with trace updated") \ + _(FULL, "Pkts with trace options but no space") \ + _(LOOPBACK, "Pkts with trace options Loopback") \ + _(LOOPBACK_REPLY, "Pkts with trace options Loopback Reply") + +static char *ip6_hop_by_hop_ioam_trace_stats_strings[] = { +#define _(sym,string) string, + foreach_ip6_hop_by_hop_ioam_trace_stats +#undef _ +}; + +typedef enum +{ +#define _(sym,str) IP6_IOAM_TRACE_##sym, + foreach_ip6_hop_by_hop_ioam_trace_stats +#undef _ + IP6_IOAM_TRACE_N_STATS, +} ip6_ioam_trace_stats_t; + + +typedef struct +{ + /* stats */ + u64 counters[ARRAY_LEN (ip6_hop_by_hop_ioam_trace_stats_strings)]; + + /* convenience */ + vlib_main_t *vlib_main; + vnet_main_t *vnet_main; +} ip6_hop_by_hop_ioam_trace_main_t; + +ip6_hop_by_hop_ioam_trace_main_t ip6_hop_by_hop_ioam_trace_main; + +always_inline void +ip6_ioam_trace_stats_increment_counter (u32 counter_index, u64 increment) +{ + ip6_hop_by_hop_ioam_trace_main_t *hm = &ip6_hop_by_hop_ioam_trace_main; + + hm->counters[counter_index] += increment; +} + + +static u8 * +format_ioam_data_list_element (u8 * s, va_list * args) +{ + u32 *elt = va_arg (*args, u32 *); + u8 *trace_type_p = va_arg (*args, u8 *); + u8 trace_type = *trace_type_p; + + + if (trace_type & BIT_TTL_NODEID) + { + u32 ttl_node_id_host_byte_order = clib_net_to_host_u32 (*elt); + s = format (s, "ttl 0x%x node id 0x%x ", + ttl_node_id_host_byte_order >> 24, + ttl_node_id_host_byte_order & 0x00FFFFFF); + + elt++; + } + + if (trace_type & BIT_ING_INTERFACE) + { + u32 ingress_host_byte_order = clib_net_to_host_u32 (*elt); + s = format (s, "ingress 0x%x egress 0x%x ", + ingress_host_byte_order >> 16, + ingress_host_byte_order & 0xFFFF); + elt++; + } + + if (trace_type & BIT_TIMESTAMP) + { + u32 ts_in_host_byte_order = clib_net_to_host_u32 (*elt); + s = format (s, "ts 0x%x \n", ts_in_host_byte_order); + elt++; + } + + if (trace_type & BIT_APPDATA) + { + u32 appdata_in_host_byte_order = clib_net_to_host_u32 (*elt); + s = format (s, "app 0x%x ", appdata_in_host_byte_order); + elt++; + } + + return s; +} + + +int +ip6_ioam_trace_get_sizeof_handler (u32 * result) +{ + u16 size = 0; + u8 trace_data_size = 0; + trace_profile *profile = NULL; + + *result = 0; + + profile = trace_profile_find (); + + if (PREDICT_FALSE (!profile)) + { + ip6_ioam_trace_stats_increment_counter (IP6_IOAM_TRACE_PROFILE_MISS, 1); + return (-1); + } + + 
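/* The whole trace option must fit in the u8 hop-by-hop option length, so num_elts * per-node record size is bounded (checked against 254 below). */ + 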
trace_data_size = fetch_trace_data_size (profile->trace_type); + if (PREDICT_FALSE (trace_data_size == 0)) + return VNET_API_ERROR_INVALID_VALUE; + + if (PREDICT_FALSE (profile->num_elts * trace_data_size > 254)) + return VNET_API_ERROR_INVALID_VALUE; + + size += + sizeof (ioam_trace_option_t) + (profile->num_elts * trace_data_size); + *result = size; + + return 0; +} + + + +int +ip6_hop_by_hop_ioam_trace_rewrite_handler (u8 * rewrite_string, + u8 * rewrite_size) +{ + ioam_trace_option_t *trace_option = NULL; + u8 trace_data_size = 0; + u8 trace_option_elts = 0; + trace_profile *profile = NULL; + + + profile = trace_profile_find (); + + if (PREDICT_FALSE (!profile)) + { + ip6_ioam_trace_stats_increment_counter (IP6_IOAM_TRACE_PROFILE_MISS, 1); + return (-1); + } + + if (PREDICT_FALSE (!rewrite_string)) + return -1; + + trace_option_elts = profile->num_elts; + trace_data_size = fetch_trace_data_size (profile->trace_type); + trace_option = (ioam_trace_option_t *) rewrite_string; + trace_option->hdr.type = HBH_OPTION_TYPE_IOAM_TRACE_DATA_LIST | + HBH_OPTION_TYPE_DATA_CHANGE_ENROUTE; + trace_option->hdr.length = 2 /*ioam_trace_type,data_list_elts_left */ + + trace_option_elts * trace_data_size; + trace_option->trace_hdr.ioam_trace_type = + profile->trace_type & TRACE_TYPE_MASK; + trace_option->trace_hdr.data_list_elts_left = trace_option_elts; + *rewrite_size = + sizeof (ioam_trace_option_t) + (trace_option_elts * trace_data_size); + + return 0; +} + +always_inline void +ip6_hbh_ioam_loopback_handler (vlib_buffer_t * b, ip6_header_t * ip, + ioam_trace_option_t * trace) +{ + u32 buf_index; + ip6_hop_by_hop_ioam_main_t *hm = &ip6_hop_by_hop_ioam_main; + vlib_buffer_t *b0; + vlib_frame_t *nf = 0; + u32 *to_next; + vlib_node_t *next_node; + ip6_header_t *ip6; + ip6_hop_by_hop_header_t *hbh; + ioam_trace_option_t *opt; + udp_ping_t *udp; + + next_node = vlib_get_node_by_name (hm->vlib_main, (u8 *) "ip6-lookup"); + nf = vlib_get_frame_to_node (hm->vlib_main, next_node->index); + nf->n_vectors = 0; + to_next = vlib_frame_vector_args (nf); + + b0 = vlib_buffer_copy (hm->vlib_main, b); + buf_index = vlib_get_buffer_index (hm->vlib_main, b0); + + vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0; + + ip6 = vlib_buffer_get_current (b0); + hbh = (ip6_hop_by_hop_header_t *) (ip6 + 1); + opt = (ioam_trace_option_t *) + ip6_hbh_get_option (hbh, HBH_OPTION_TYPE_IOAM_TRACE_DATA_LIST); + + udp = (udp_ping_t *) ((u8 *) hbh + ((hbh->length + 1) << 3)); + udp_ping_create_reply_from_probe_ip6 (ip6, hbh, udp); + ip6_hbh_ioam_trace_set_bit (opt, BIT_LOOPBACK_REPLY); + + *to_next = buf_index; + nf->n_vectors++; + to_next++; + + vlib_put_frame_to_node (hm->vlib_main, next_node->index, nf); + ip6_ioam_trace_stats_increment_counter (IP6_IOAM_TRACE_LOOPBACK, 1); +} + +int +ip6_hbh_ioam_trace_data_list_handler (vlib_buffer_t * b, ip6_header_t * ip, + ip6_hop_by_hop_option_t * opt) +{ + ip6_hop_by_hop_ioam_main_t *hm = &ip6_hop_by_hop_ioam_main; + u8 elt_index = 0; + ioam_trace_option_t *trace = (ioam_trace_option_t *) opt; + u32 adj_index = vnet_buffer (b)->ip.adj_index[VLIB_TX]; + ip_adjacency_t *adj = adj_get (adj_index); + time_u64_t time_u64; + u32 *elt; + int rv = 0; + trace_profile *profile = NULL; + + + profile = trace_profile_find (); + + if (PREDICT_FALSE (!profile)) + { + ip6_ioam_trace_stats_increment_counter (IP6_IOAM_TRACE_PROFILE_MISS, 1); + return (-1); + } + + /* Don't trace loopback reply packets */ + if (trace->trace_hdr.ioam_trace_type & BIT_LOOPBACK_REPLY) + { + 
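/* A loopback reply already carries a completed trace back to the source; count it and skip any further trace update. */ + 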
ip6_ioam_trace_stats_increment_counter (IP6_IOAM_TRACE_LOOPBACK_REPLY, + 1); + return rv; + } + + time_u64.as_u64 = 0; + + if (PREDICT_TRUE (trace->trace_hdr.data_list_elts_left)) + { + trace->trace_hdr.data_list_elts_left--; + /* fetch_trace_data_size returns bytes. Convert to 32-bit words + * to index to this node's slot in the data list. + */ + elt_index = + trace->trace_hdr.data_list_elts_left * + fetch_trace_data_size (trace->trace_hdr.ioam_trace_type) / 4; + elt = &trace->trace_hdr.elts[elt_index]; + if (trace->trace_hdr.ioam_trace_type & BIT_TTL_NODEID) + { + *elt = + clib_host_to_net_u32 ((ip->hop_limit << 24) | profile->node_id); + elt++; + } + + if (trace->trace_hdr.ioam_trace_type & BIT_ING_INTERFACE) + { + *elt = + (vnet_buffer (b)->sw_if_index[VLIB_RX] & 0xFFFF) << 16 | + (adj->rewrite_header.sw_if_index & 0xFFFF); + *elt = clib_host_to_net_u32 (*elt); + elt++; + } + + if (trace->trace_hdr.ioam_trace_type & BIT_TIMESTAMP) + { + /* Send least significant 32 bits */ + f64 time_f64 = + (f64) (((f64) hm->unix_time_0) + + (vlib_time_now (hm->vlib_main) - hm->vlib_time_0)); + + time_u64.as_u64 = time_f64 * trace_tsp_mul[profile->trace_tsp]; + *elt = clib_host_to_net_u32 (time_u64.as_u32[0]); + elt++; + } + + if (trace->trace_hdr.ioam_trace_type & BIT_APPDATA) + { + /* $$$ set elt0->app_data */ + *elt = clib_host_to_net_u32 (profile->app_data); + elt++; + } + + + if (PREDICT_FALSE (trace->trace_hdr.ioam_trace_type & BIT_LOOPBACK)) + { + /* if loopback flag set then copy the packet + * and send it back to source */ + ip6_hbh_ioam_loopback_handler (b, ip, trace); + } + + ip6_ioam_trace_stats_increment_counter (IP6_IOAM_TRACE_UPDATED, 1); + } + else + { + ip6_ioam_trace_stats_increment_counter (IP6_IOAM_TRACE_FULL, 1); + } + return (rv); +} + +u8 * +ip6_hbh_ioam_trace_data_list_trace_handler (u8 * s, + ip6_hop_by_hop_option_t * opt) +{ + ioam_trace_option_t *trace; + u8 trace_data_size_in_words = 0; + u32 *elt; + int elt_index = 0; + + trace = (ioam_trace_option_t *) opt; + s = + format (s, " Trace Type 0x%x , %d elts left\n", + trace->trace_hdr.ioam_trace_type, + trace->trace_hdr.data_list_elts_left); + trace_data_size_in_words = + fetch_trace_data_size (trace->trace_hdr.ioam_trace_type) / 4; + elt = &trace->trace_hdr.elts[0]; + while ((u8 *) elt < + ((u8 *) (&trace->trace_hdr.elts[0]) + trace->hdr.length - 2 + /* -2 accounts for ioam_trace_type,elts_left */ )) + { + s = format (s, " [%d] %U\n", elt_index, + format_ioam_data_list_element, + elt, &trace->trace_hdr.ioam_trace_type); + elt_index++; + elt += trace_data_size_in_words; + } + return (s); +} + + +static clib_error_t * +ip6_show_ioam_trace_cmd_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + ip6_hop_by_hop_ioam_trace_main_t *hm = &ip6_hop_by_hop_ioam_trace_main; + u8 *s = 0; + int i = 0; + + for (i = 0; i < IP6_IOAM_TRACE_N_STATS; i++) + { + s = + format (s, " %s - %lu\n", ip6_hop_by_hop_ioam_trace_stats_strings[i], + hm->counters[i]); + } + + vlib_cli_output (vm, "%v", s); + vec_free (s); + return 0; +} + + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (ip6_show_ioam_trace_cmd, static) = { + .path = "show ioam trace", + .short_help = "iOAM trace statistics", + .function = ip6_show_ioam_trace_cmd_fn, +}; +/* *INDENT-ON* */ + +/* *INDENT-OFF* */ +VLIB_PLUGIN_REGISTER () = { + .version = VPP_BUILD_VER, + .description = "Inband OAM", +}; +/* *INDENT-ON* */ + +static clib_error_t * +ip6_hop_by_hop_ioam_trace_init (vlib_main_t * vm) +{ + ip6_hop_by_hop_ioam_trace_main_t *hm = &ip6_hop_by_hop_ioam_trace_main; + 
clib_error_t *error; + + if ((error = vlib_call_init_function (vm, ip_main_init))) + return (error); + + if ((error = vlib_call_init_function (vm, ip6_lookup_init))) + return error; + + if ((error = vlib_call_init_function (vm, ip6_hop_by_hop_ioam_init))) + return (error); + + hm->vlib_main = vm; + hm->vnet_main = vnet_get_main (); + memset (hm->counters, 0, sizeof (hm->counters)); + + + if (ip6_hbh_register_option + (HBH_OPTION_TYPE_IOAM_TRACE_DATA_LIST, + ip6_hbh_ioam_trace_data_list_handler, + ip6_hbh_ioam_trace_data_list_trace_handler) < 0) + return (clib_error_create + ("registration of HBH_OPTION_TYPE_IOAM_TRACE_DATA_LIST failed")); + + + if (ip6_hbh_add_register_option (HBH_OPTION_TYPE_IOAM_TRACE_DATA_LIST, + sizeof (ioam_trace_option_t), + ip6_hop_by_hop_ioam_trace_rewrite_handler) + < 0) + return (clib_error_create + ("registration of HBH_OPTION_TYPE_IOAM_TRACE_DATA_LIST for rewrite failed")); + + + return (0); +} + +int +ip6_trace_profile_cleanup (void) +{ + ip6_hop_by_hop_ioam_main_t *hm = &ip6_hop_by_hop_ioam_main; + + hm->options_size[HBH_OPTION_TYPE_IOAM_TRACE_DATA_LIST] = 0; + + return 0; + +} + + +int +ip6_trace_profile_setup (void) +{ + u32 trace_size = 0; + ip6_hop_by_hop_ioam_main_t *hm = &ip6_hop_by_hop_ioam_main; + + trace_profile *profile = NULL; + + + profile = trace_profile_find (); + + if (PREDICT_FALSE (!profile)) + { + ip6_ioam_trace_stats_increment_counter (IP6_IOAM_TRACE_PROFILE_MISS, 1); + return (-1); + } + + + if (ip6_ioam_trace_get_sizeof_handler (&trace_size) < 0) + return (-1); + + hm->options_size[HBH_OPTION_TYPE_IOAM_TRACE_DATA_LIST] = trace_size; + + return (0); +} + + +VLIB_INIT_FUNCTION (ip6_hop_by_hop_ioam_trace_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/encap/ip6_ioam_trace.h b/src/plugins/ioam/encap/ip6_ioam_trace.h new file mode 100644 index 00000000..4eda6110 --- /dev/null +++ b/src/plugins/ioam/encap/ip6_ioam_trace.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * trace_util.h -- Trace Profile Utility header + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef PLUGINS_IOAM_PLUGIN_IOAM_ENCAP_IP6_IOAM_TRACE_H_ +#define PLUGINS_IOAM_PLUGIN_IOAM_ENCAP_IP6_IOAM_TRACE_H_ + +#include <vnet/ip/ip6_hop_by_hop_packet.h> +#include <ioam/lib-trace/trace_util.h> + +/* *INDENT-OFF* */ +typedef CLIB_PACKED(struct { + ip6_hop_by_hop_option_t hdr; + ioam_trace_hdr_t trace_hdr; +}) ioam_trace_option_t; +/* *INDENT-ON* */ + +always_inline void +ip6_hbh_ioam_trace_set_bit (ioam_trace_option_t * trace, u8 trace_bit) +{ + ioam_trace_set_bit (&trace->trace_hdr, trace_bit); +} + +always_inline void +ip6_hbh_ioam_trace_reset_bit (ioam_trace_option_t * trace, u8 trace_bit) +{ + ioam_trace_reset_bit (&trace->trace_hdr, trace_bit); +} + +#endif /* PLUGINS_IOAM_PLUGIN_IOAM_ENCAP_IP6_IOAM_TRACE_H_ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/export-common/ioam_export.h b/src/plugins/ioam/export-common/ioam_export.h new file mode 100644 index 00000000..9de0d13b --- /dev/null +++ b/src/plugins/ioam/export-common/ioam_export.h @@ -0,0 +1,634 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_ioam_export_h__ +#define __included_ioam_export_h__ + +#include <vnet/vnet.h> +#include <vnet/ip/ip.h> +#include <vnet/ip/ip_packet.h> +#include <vnet/ip/ip4_packet.h> +#include <vnet/ip/ip6_packet.h> +#include <vnet/ip/ip6_hop_by_hop.h> +#include <vnet/udp/udp.h> +#include <vnet/flow/ipfix_packet.h> + +#include <vppinfra/pool.h> +#include <vppinfra/hash.h> +#include <vppinfra/error.h> +#include <vppinfra/elog.h> + +#include <vlib/threads.h> + +typedef struct ioam_export_buffer +{ + /* Allocated buffer */ + u32 buffer_index; + u64 touched_at; + u8 records_in_this_buffer; +} ioam_export_buffer_t; + + +typedef struct +{ + /* API message ID base */ + u16 msg_id_base; + u16 set_id; + + /* TODO: to support multiple collectors all this has to be grouped and create a vector here */ + u8 *record_header; + u32 sequence_number; + u32 domain_id; + + /* ipfix collector, our ip address */ + ip4_address_t ipfix_collector; + ip4_address_t src_address; + + /* Pool of ioam_export_buffer_t */ + ioam_export_buffer_t *buffer_pool; + /* Vector of per thread ioam_export_buffer_t to buffer pool index */ + u32 *buffer_per_thread; + /* Lock per thread to swap buffers between worker and timer process */ + volatile u32 **lockp; + + /* time scale transform */ + u32 unix_time_0; + f64 vlib_time_0; + + /* convenience */ + vlib_main_t *vlib_main; + vnet_main_t *vnet_main; + ethernet_main_t *ethernet_main; + u32 next_node_index; + + uword my_hbh_slot; + u32 export_process_node_index; +} ioam_export_main_t; + + +#define DEFAULT_EXPORT_SIZE (3 * CLIB_CACHE_LINE_BYTES) +/* + * Number of records in a buffer + * ~(MTU (1500) - [ip hdr(40) + UDP(8) + ipfix (24)]) / DEFAULT_EXPORT_SIZE + */ +#define DEFAULT_EXPORT_RECORDS 7 + +inline static void +ioam_export_set_next_node (ioam_export_main_t * em, u8 * next_node_name) +{ 
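+ /* Resolve the node by name once and cache its index; frames built in + ioam_export_send_buffer() are enqueued to this node. */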
+ vlib_node_t *next_node; + + next_node = vlib_get_node_by_name (em->vlib_main, next_node_name); + em->next_node_index = next_node->index; +} + +inline static void +ioam_export_reset_next_node (ioam_export_main_t * em) +{ + vlib_node_t *next_node; + + next_node = vlib_get_node_by_name (em->vlib_main, (u8 *) "ip4-lookup"); + em->next_node_index = next_node->index; +} + +always_inline ioam_export_buffer_t * +ioam_export_get_my_buffer (ioam_export_main_t * em, u32 thread_id) +{ + + if (vec_len (em->buffer_per_thread) > thread_id) + return (pool_elt_at_index + (em->buffer_pool, em->buffer_per_thread[thread_id])); + return (0); +} + +inline static int +ioam_export_buffer_add_header (ioam_export_main_t * em, vlib_buffer_t * b0) +{ + clib_memcpy (b0->data, em->record_header, vec_len (em->record_header)); + b0->current_data = 0; + b0->current_length = vec_len (em->record_header); + b0->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; + return (1); +} + +inline static int +ioam_export_init_buffer (ioam_export_main_t * em, vlib_main_t * vm, + ioam_export_buffer_t * eb) +{ + vlib_buffer_t *b = 0; + + if (!eb) + return (-1); + /* TODO: Perhaps buffer init from template here */ + if (vlib_buffer_alloc (vm, &(eb->buffer_index), 1) != 1) + return (-2); + eb->records_in_this_buffer = 0; + eb->touched_at = vlib_time_now (vm); + b = vlib_get_buffer (vm, eb->buffer_index); + (void) ioam_export_buffer_add_header (em, b); + vnet_buffer (b)->sw_if_index[VLIB_RX] = 0; + vnet_buffer (b)->sw_if_index[VLIB_TX] = ~0; + return (1); +} + +inline static void +ioam_export_thread_buffer_free (ioam_export_main_t * em) +{ + vlib_main_t *vm = em->vlib_main; + ioam_export_buffer_t *eb = 0; + int i; + for (i = 0; i < vec_len (em->buffer_per_thread); i++) + { + eb = pool_elt_at_index (em->buffer_pool, em->buffer_per_thread[i]); + if (eb) + vlib_buffer_free (vm, &(eb->buffer_index), 1); + } + for (i = 0; i < vec_len (em->lockp); i++) + clib_mem_free ((void *) em->lockp[i]); + vec_free (em->buffer_per_thread); + pool_free (em->buffer_pool); + vec_free (em->lockp); + em->buffer_per_thread = 0; + em->buffer_pool = 0; + em->lockp = 0; +} + +inline static int +ioam_export_thread_buffer_init (ioam_export_main_t * em, vlib_main_t * vm) +{ + int no_of_threads = vec_len (vlib_worker_threads); + int i; + ioam_export_buffer_t *eb = 0; + + pool_alloc_aligned (em->buffer_pool, + no_of_threads - 1, CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (em->buffer_per_thread, + no_of_threads - 1, CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (em->lockp, no_of_threads - 1, CLIB_CACHE_LINE_BYTES); + + if (!em->buffer_per_thread || !em->buffer_pool || !em->lockp) + { + return (-1); + } + for (i = 0; i < no_of_threads; i++) + { + eb = 0; + pool_get_aligned (em->buffer_pool, eb, CLIB_CACHE_LINE_BYTES); + memset (eb, 0, sizeof (*eb)); + em->buffer_per_thread[i] = eb - em->buffer_pool; + if (ioam_export_init_buffer (em, vm, eb) != 1) + { + ioam_export_thread_buffer_free (em); + return (-2); + } + em->lockp[i] = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES, + CLIB_CACHE_LINE_BYTES); + memset ((void *) em->lockp[i], 0, CLIB_CACHE_LINE_BYTES); + } + return (1); +} + +#define IPFIX_IOAM_EXPORT_ID 272 +#define IPFIX_VXLAN_IOAM_EXPORT_ID 273 + +/* Used to build the rewrite */ +/* data set packet */ +typedef struct +{ + ipfix_message_header_t h; + ipfix_set_header_t s; +} ipfix_data_packet_t; + +typedef struct +{ + ip4_header_t ip4; + udp_header_t udp; + ipfix_data_packet_t ipfix; +} ip4_ipfix_data_packet_t; + + +inline static void +ioam_export_header_cleanup 
(ioam_export_main_t * em, + ip4_address_t * collector_address, + ip4_address_t * src_address) +{ + vec_free (em->record_header); + em->record_header = 0; +} + +inline static int +ioam_export_header_create (ioam_export_main_t * em, + ip4_address_t * collector_address, + ip4_address_t * src_address) +{ + ip4_header_t *ip; + udp_header_t *udp; + ipfix_message_header_t *h; + ipfix_set_header_t *s; + u8 *rewrite = 0; + ip4_ipfix_data_packet_t *tp; + + + /* allocate rewrite space */ + vec_validate_aligned (rewrite, + sizeof (ip4_ipfix_data_packet_t) - 1, + CLIB_CACHE_LINE_BYTES); + + tp = (ip4_ipfix_data_packet_t *) rewrite; + ip = (ip4_header_t *) & tp->ip4; + udp = (udp_header_t *) (ip + 1); + h = (ipfix_message_header_t *) (udp + 1); + s = (ipfix_set_header_t *) (h + 1); + + ip->ip_version_and_header_length = 0x45; + ip->ttl = 254; + ip->protocol = IP_PROTOCOL_UDP; + ip->src_address.as_u32 = src_address->as_u32; + ip->dst_address.as_u32 = collector_address->as_u32; + udp->src_port = clib_host_to_net_u16 (UDP_DST_PORT_ipfix); + udp->dst_port = clib_host_to_net_u16 (UDP_DST_PORT_ipfix); + /* FIXUP: UDP length */ + udp->length = clib_host_to_net_u16 (vec_len (rewrite) + + (DEFAULT_EXPORT_RECORDS * + DEFAULT_EXPORT_SIZE) - sizeof (*ip)); + + /* FIXUP: message header export_time */ + /* FIXUP: message header sequence_number */ + h->domain_id = clib_host_to_net_u32 (em->domain_id); + + /* FIXUP: Setid length in octets if records exported are not default */ + s->set_id_length = ipfix_set_id_length (em->set_id, + (sizeof (*s) + + (DEFAULT_EXPORT_RECORDS * + DEFAULT_EXPORT_SIZE))); + + /* FIXUP: h version and length in octets if records exported are not default */ + h->version_length = version_length (sizeof (*h) + + (sizeof (*s) + + (DEFAULT_EXPORT_RECORDS * + DEFAULT_EXPORT_SIZE))); + + /* FIXUP: ip length if records exported are not default */ + /* FIXUP: ip checksum if records exported are not default */ + ip->length = clib_host_to_net_u16 (vec_len (rewrite) + + (DEFAULT_EXPORT_RECORDS * + DEFAULT_EXPORT_SIZE)); + ip->checksum = ip4_header_checksum (ip); + _vec_len (rewrite) = sizeof (ip4_ipfix_data_packet_t); + em->record_header = rewrite; + return (1); +} + +inline static int +ioam_export_send_buffer (ioam_export_main_t * em, vlib_main_t * vm, + ioam_export_buffer_t * eb) +{ + ip4_header_t *ip; + udp_header_t *udp; + ipfix_message_header_t *h; + ipfix_set_header_t *s; + ip4_ipfix_data_packet_t *tp; + vlib_buffer_t *b0; + u16 new_l0, old_l0; + ip_csum_t sum0; + vlib_frame_t *nf = 0; + u32 *to_next; + + b0 = vlib_get_buffer (vm, eb->buffer_index); + tp = vlib_buffer_get_current (b0); + ip = (ip4_header_t *) & tp->ip4; + udp = (udp_header_t *) (ip + 1); + h = (ipfix_message_header_t *) (udp + 1); + s = (ipfix_set_header_t *) (h + 1); + + /* FIXUP: message header export_time */ + h->export_time = clib_host_to_net_u32 ((u32) + (((f64) em->unix_time_0) + + (vlib_time_now (em->vlib_main) - + em->vlib_time_0))); + + /* FIXUP: message header sequence_number */ + h->sequence_number = clib_host_to_net_u32 (em->sequence_number++); + + /* FIXUP: lengths if different from default */ + if (PREDICT_FALSE (eb->records_in_this_buffer != DEFAULT_EXPORT_RECORDS)) + { + s->set_id_length = ipfix_set_id_length (em->set_id /* set_id */ , + b0->current_length - + (sizeof (*ip) + sizeof (*udp) + + sizeof (*h))); + h->version_length = + version_length (b0->current_length - (sizeof (*ip) + sizeof (*udp))); + sum0 = ip->checksum; + old_l0 = ip->length; + new_l0 = clib_host_to_net_u16 ((u16) b0->current_length); + sum0 
= ip_csum_update (sum0, old_l0, new_l0, ip4_header_t, + length /* changed member */ ); + ip->checksum = ip_csum_fold (sum0); + ip->length = new_l0; + udp->length = clib_host_to_net_u16 (b0->current_length - sizeof (*ip)); + } + + /* Enqueue pkts to ip4-lookup */ + + nf = vlib_get_frame_to_node (vm, em->next_node_index); + nf->n_vectors = 0; + to_next = vlib_frame_vector_args (nf); + nf->n_vectors = 1; + to_next[0] = eb->buffer_index; + vlib_put_frame_to_node (vm, em->next_node_index, nf); + return (1); + +} + +#define EXPORT_TIMEOUT (20.0) +#define THREAD_PERIOD (30.0) +inline static uword +ioam_export_process_common (ioam_export_main_t * em, vlib_main_t * vm, + vlib_node_runtime_t * rt, vlib_frame_t * f, + u32 index) +{ + f64 now; + f64 timeout = 30.0; + uword event_type; + uword *event_data = 0; + int i; + ioam_export_buffer_t *eb = 0, *new_eb = 0; + u32 *vec_buffer_indices = 0; + u32 *vec_buffer_to_be_sent = 0; + u32 *thread_index = 0; + u32 new_pool_index = 0; + + em->export_process_node_index = index; + /* Wait for Godot... */ + vlib_process_wait_for_event_or_clock (vm, 1e9); + event_type = vlib_process_get_events (vm, &event_data); + if (event_type != 1) + clib_warning ("bogus kickoff event received, %d", event_type); + vec_reset_length (event_data); + + while (1) + { + vlib_process_wait_for_event_or_clock (vm, timeout); + event_type = vlib_process_get_events (vm, &event_data); + switch (event_type) + { + case 2: /* Stop and Wait for kickoff again */ + timeout = 1e9; + break; + case 1: /* kickoff : Check for unsent buffers */ + timeout = THREAD_PERIOD; + break; + case ~0: /* timeout */ + break; + } + vec_reset_length (event_data); + now = vlib_time_now (vm); + /* + * Create buffers for threads that are not active enough + * to send out the export records + */ + for (i = 0; i < vec_len (em->buffer_per_thread); i++) + { + /* If the worker thread is processing export records ignore further checks */ + if (*em->lockp[i] == 1) + continue; + eb = pool_elt_at_index (em->buffer_pool, em->buffer_per_thread[i]); + if (eb->records_in_this_buffer > 0 + && now > (eb->touched_at + EXPORT_TIMEOUT)) + { + pool_get_aligned (em->buffer_pool, new_eb, + CLIB_CACHE_LINE_BYTES); + memset (new_eb, 0, sizeof (*new_eb)); + if (ioam_export_init_buffer (em, vm, new_eb) == 1) + { + new_pool_index = new_eb - em->buffer_pool; + vec_add (vec_buffer_indices, &new_pool_index, 1); + vec_add (vec_buffer_to_be_sent, &em->buffer_per_thread[i], + 1); + vec_add (thread_index, &i, 1); + } + else + { + pool_put (em->buffer_pool, new_eb); + /*Give up */ + goto CLEANUP; + } + } + } + if (vec_len (thread_index) != 0) + { + /* + * Now swap the buffers out + */ + for (i = 0; i < vec_len (thread_index); i++) + { + while (__sync_lock_test_and_set (em->lockp[thread_index[i]], 1)) + ; + em->buffer_per_thread[thread_index[i]] = + vec_pop (vec_buffer_indices); + *em->lockp[thread_index[i]] = 0; + } + + /* Send the buffers */ + for (i = 0; i < vec_len (vec_buffer_to_be_sent); i++) + { + eb = + pool_elt_at_index (em->buffer_pool, vec_buffer_to_be_sent[i]); + ioam_export_send_buffer (em, vm, eb); + pool_put (em->buffer_pool, eb); + } + } + + CLEANUP: + /* Free any leftover/unused buffers and everything that was allocated */ + for (i = 0; i < vec_len (vec_buffer_indices); i++) + { + new_eb = pool_elt_at_index (em->buffer_pool, vec_buffer_indices[i]); + vlib_buffer_free (vm, &new_eb->buffer_index, 1); + pool_put (em->buffer_pool, new_eb); + } + vec_free (vec_buffer_indices); + vec_free (vec_buffer_to_be_sent); + vec_free (thread_index); + 
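/* Loop and wait for the next timeout or kickoff/stop event. */ + 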
} + return 0; /* not so much */ +} + +#define ioam_export_node_common(EM, VM, N, F, HTYPE, L, V, NEXT, FIXUP_FUNC) \ +do { \ + u32 n_left_from, *from, *to_next; \ + export_next_t next_index; \ + u32 pkts_recorded = 0; \ + ioam_export_buffer_t *my_buf = 0; \ + vlib_buffer_t *eb0 = 0; \ + u32 ebi0 = 0; \ + from = vlib_frame_vector_args (F); \ + n_left_from = (F)->n_vectors; \ + next_index = (N)->cached_next_index; \ + while (__sync_lock_test_and_set ((EM)->lockp[(VM)->thread_index], 1)); \ + my_buf = ioam_export_get_my_buffer (EM, (VM)->thread_index); \ + my_buf->touched_at = vlib_time_now (VM); \ + while (n_left_from > 0) \ + { \ + u32 n_left_to_next; \ + vlib_get_next_frame (VM, N, next_index, to_next, n_left_to_next); \ + while (n_left_from >= 4 && n_left_to_next >= 2) \ + { \ + u32 next0 = NEXT; \ + u32 next1 = NEXT; \ + u32 bi0, bi1; \ + HTYPE *ip0, *ip1; \ + vlib_buffer_t *p0, *p1; \ + u32 ip_len0, ip_len1; \ + { \ + vlib_buffer_t *p2, *p3; \ + p2 = vlib_get_buffer (VM, from[2]); \ + p3 = vlib_get_buffer (VM, from[3]); \ + vlib_prefetch_buffer_header (p2, LOAD); \ + vlib_prefetch_buffer_header (p3, LOAD); \ + CLIB_PREFETCH (p2->data, 3 * CLIB_CACHE_LINE_BYTES, LOAD); \ + CLIB_PREFETCH (p3->data, 3 * CLIB_CACHE_LINE_BYTES, LOAD); \ + } \ + to_next[0] = bi0 = from[0]; \ + to_next[1] = bi1 = from[1]; \ + from += 2; \ + to_next += 2; \ + n_left_from -= 2; \ + n_left_to_next -= 2; \ + p0 = vlib_get_buffer (VM, bi0); \ + p1 = vlib_get_buffer (VM, bi1); \ + ip0 = vlib_buffer_get_current (p0); \ + ip1 = vlib_buffer_get_current (p1); \ + ip_len0 = \ + clib_net_to_host_u16 (ip0->L) + sizeof (HTYPE); \ + ip_len1 = \ + clib_net_to_host_u16 (ip1->L) + sizeof (HTYPE); \ + ebi0 = my_buf->buffer_index; \ + eb0 = vlib_get_buffer (VM, ebi0); \ + if (PREDICT_FALSE (eb0 == 0)) \ + goto NO_BUFFER1; \ + ip_len0 = \ + ip_len0 > DEFAULT_EXPORT_SIZE ? DEFAULT_EXPORT_SIZE : ip_len0; \ + ip_len1 = \ + ip_len1 > DEFAULT_EXPORT_SIZE ? 
DEFAULT_EXPORT_SIZE : ip_len1; \ + copy3cachelines (eb0->data + eb0->current_length, ip0, ip_len0); \ + FIXUP_FUNC(eb0, p0); \ + eb0->current_length += DEFAULT_EXPORT_SIZE; \ + my_buf->records_in_this_buffer++; \ + if (my_buf->records_in_this_buffer >= DEFAULT_EXPORT_RECORDS) \ + { \ + ioam_export_send_buffer (EM, VM, my_buf); \ + ioam_export_init_buffer (EM, VM, my_buf); \ + } \ + ebi0 = my_buf->buffer_index; \ + eb0 = vlib_get_buffer (VM, ebi0); \ + if (PREDICT_FALSE (eb0 == 0)) \ + goto NO_BUFFER1; \ + copy3cachelines (eb0->data + eb0->current_length, ip1, ip_len1); \ + FIXUP_FUNC(eb0, p1); \ + eb0->current_length += DEFAULT_EXPORT_SIZE; \ + my_buf->records_in_this_buffer++; \ + if (my_buf->records_in_this_buffer >= DEFAULT_EXPORT_RECORDS) \ + { \ + ioam_export_send_buffer (EM, VM, my_buf); \ + ioam_export_init_buffer (EM, VM, my_buf); \ + } \ + pkts_recorded += 2; \ + if (PREDICT_FALSE (((N)->flags & VLIB_NODE_FLAG_TRACE))) \ + { \ + if (p0->flags & VLIB_BUFFER_IS_TRACED) \ + { \ + export_trace_t *t = \ + vlib_add_trace (VM, N, p0, sizeof (*t)); \ + t->flow_label = \ + clib_net_to_host_u32 (ip0->V); \ + t->next_index = next0; \ + } \ + if (p1->flags & VLIB_BUFFER_IS_TRACED) \ + { \ + export_trace_t *t = \ + vlib_add_trace (VM, N, p1, sizeof (*t)); \ + t->flow_label = \ + clib_net_to_host_u32 (ip1->V); \ + t->next_index = next1; \ + } \ + } \ + NO_BUFFER1: \ + vlib_validate_buffer_enqueue_x2 (VM, N, next_index, \ + to_next, n_left_to_next, \ + bi0, bi1, next0, next1); \ + } \ + while (n_left_from > 0 && n_left_to_next > 0) \ + { \ + u32 bi0; \ + vlib_buffer_t *p0; \ + u32 next0 = NEXT; \ + HTYPE *ip0; \ + u32 ip_len0; \ + bi0 = from[0]; \ + to_next[0] = bi0; \ + from += 1; \ + to_next += 1; \ + n_left_from -= 1; \ + n_left_to_next -= 1; \ + p0 = vlib_get_buffer (VM, bi0); \ + ip0 = vlib_buffer_get_current (p0); \ + ip_len0 = \ + clib_net_to_host_u16 (ip0->L) + sizeof (HTYPE); \ + ebi0 = my_buf->buffer_index; \ + eb0 = vlib_get_buffer (VM, ebi0); \ + if (PREDICT_FALSE (eb0 == 0)) \ + goto NO_BUFFER; \ + ip_len0 = \ + ip_len0 > DEFAULT_EXPORT_SIZE ? 
DEFAULT_EXPORT_SIZE : ip_len0; \ + copy3cachelines (eb0->data + eb0->current_length, ip0, ip_len0); \ + FIXUP_FUNC(eb0, p0); \ + eb0->current_length += DEFAULT_EXPORT_SIZE; \ + my_buf->records_in_this_buffer++; \ + if (my_buf->records_in_this_buffer >= DEFAULT_EXPORT_RECORDS) \ + { \ + ioam_export_send_buffer (EM, VM, my_buf); \ + ioam_export_init_buffer (EM, VM, my_buf); \ + } \ + if (PREDICT_FALSE (((N)->flags & VLIB_NODE_FLAG_TRACE) \ + && (p0->flags & VLIB_BUFFER_IS_TRACED))) \ + { \ + export_trace_t *t = vlib_add_trace (VM, (N), p0, sizeof (*t)); \ + t->flow_label = \ + clib_net_to_host_u32 (ip0->V); \ + t->next_index = next0; \ + } \ + pkts_recorded += 1; \ + NO_BUFFER: \ + vlib_validate_buffer_enqueue_x1 (VM, N, next_index, \ + to_next, n_left_to_next, \ + bi0, next0); \ + } \ + vlib_put_next_frame (VM, N, next_index, n_left_to_next); \ + } \ + vlib_node_increment_counter (VM, export_node.index, \ + EXPORT_ERROR_RECORDED, pkts_recorded); \ + *(EM)->lockp[(VM)->thread_index] = 0; \ +} while(0) + +#endif /* __included_ioam_export_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/export-vxlan-gpe/vxlan_gpe_ioam_export.api b/src/plugins/ioam/export-vxlan-gpe/vxlan_gpe_ioam_export.api new file mode 100644 index 00000000..caa97e6e --- /dev/null +++ b/src/plugins/ioam/export-vxlan-gpe/vxlan_gpe_ioam_export.api @@ -0,0 +1,34 @@ +/* Hey Emacs use -*- mode: C -*- */ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Define a simple binary API to control the feature */ + +autoreply define vxlan_gpe_ioam_export_enable_disable { + /* Client identifier, set from api_main.my_client_index */ + u32 client_index; + + /* Arbitrary context, so client can match reply to request */ + u32 context; + + /* Enable / disable the feature */ + u8 is_disable; + + /* Collector ip address */ + u8 collector_address[4]; + u8 src_address[4]; + + /* Src ip address */ +}; diff --git a/src/plugins/ioam/export-vxlan-gpe/vxlan_gpe_ioam_export.c b/src/plugins/ioam/export-vxlan-gpe/vxlan_gpe_ioam_export.c new file mode 100644 index 00000000..ec43e484 --- /dev/null +++ b/src/plugins/ioam/export-vxlan-gpe/vxlan_gpe_ioam_export.c @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + *------------------------------------------------------------------ + * vxlan_gpe_ioam_export.c - ioam export API / debug CLI handling + *------------------------------------------------------------------ + */ + +#include <vnet/vnet.h> +#include <vnet/plugin/plugin.h> +#include <ioam/export-common/ioam_export.h> +#include <vnet/vxlan-gpe/vxlan_gpe.h> + +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vlibsocket/api.h> + +#include <ioam/lib-vxlan-gpe/vxlan_gpe_ioam.h> + +/* define message IDs */ +#include <ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_msg_enum.h> + +/* define message structures */ +#define vl_typedefs +#include <ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_all_api_h.h> +#undef vl_typedefs + +/* define generated endian-swappers */ +#define vl_endianfun +#include <ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_all_api_h.h> +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__) +#define vl_printfun +#include <ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_all_api_h.h> +#undef vl_printfun + +/* Get the API version number */ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_all_api_h.h> +#undef vl_api_version + +#define REPLY_MSG_ID_BASE sm->msg_id_base +#include <vlibapi/api_helper_macros.h> + +/* List of message types that this plugin understands */ +#define foreach_vxlan_gpe_ioam_export_plugin_api_msg \ +_(VXLAN_GPE_IOAM_EXPORT_ENABLE_DISABLE, vxlan_gpe_ioam_export_enable_disable) + +ioam_export_main_t vxlan_gpe_ioam_export_main; +extern vlib_node_registration_t vxlan_export_node; + +extern void vxlan_gpe_set_next_override (uword next); +/* Action function shared between message handler and debug CLI */ +int +vxlan_gpe_ioam_export_enable_disable (ioam_export_main_t * em, + u8 is_disable, + ip4_address_t * collector_address, + ip4_address_t * src_address) +{ + vlib_main_t *vm = em->vlib_main; + u32 node_index = vxlan_export_node.index; + vlib_node_t *vxlan_gpe_decap_ioam_node = NULL; + + if (is_disable == 0) + { + if (em->my_hbh_slot == ~0) + { + /* Hook this export node to vxlan-gpe-decap-ioam-v4 */ + vxlan_gpe_decap_ioam_node = + vlib_get_node_by_name (vm, (u8 *) "vxlan-gpe-decap-ioam-v4"); + if (!vxlan_gpe_decap_ioam_node) + { + /* node does not exist give up */ + return (-1); + } + em->my_hbh_slot = + vlib_node_add_next (vm, vxlan_gpe_decap_ioam_node->index, + node_index); + } + if (1 == ioam_export_header_create (em, collector_address, src_address)) + { + ioam_export_thread_buffer_init (em, vm); + vxlan_gpe_set_next_override (em->my_hbh_slot); + /* Turn on the export buffer check process */ + vlib_process_signal_event (vm, em->export_process_node_index, 1, 0); + + } + else + { + return (-2); + } + } + else + { + vxlan_gpe_set_next_override (VXLAN_GPE_DECAP_IOAM_V4_NEXT_POP); + ioam_export_header_cleanup (em, collector_address, src_address); + ioam_export_thread_buffer_free (em); + /* Turn off the export buffer check process */ + vlib_process_signal_event (vm, em->export_process_node_index, 2, 0); + + } + + return 0; +} + +/* API message handler */ +static void vl_api_vxlan_gpe_ioam_export_enable_disable_t_handler + (vl_api_vxlan_gpe_ioam_export_enable_disable_t * mp) +{ + vl_api_vxlan_gpe_ioam_export_enable_disable_reply_t *rmp; + ioam_export_main_t *sm = &vxlan_gpe_ioam_export_main; + int rv; + + rv = vxlan_gpe_ioam_export_enable_disable (sm, (int) (mp->is_disable), + (ip4_address_t *) + 
mp->collector_address, + (ip4_address_t *) + mp->src_address); + + REPLY_MACRO (VL_API_VXLAN_GPE_IOAM_EXPORT_ENABLE_DISABLE_REPLY); +} /* API message handler */ + + + +/* Set up the API message handling tables */ +static clib_error_t * +vxlan_gpe_ioam_export_plugin_api_hookup (vlib_main_t * vm) +{ + ioam_export_main_t *sm = &vxlan_gpe_ioam_export_main; +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + sm->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_vxlan_gpe_ioam_export_plugin_api_msg; +#undef _ + + return 0; +} + +#define vl_msg_name_crc_list +#include <ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_all_api_h.h> +#undef vl_msg_name_crc_list + +static void +setup_message_id_table (ioam_export_main_t * sm, api_main_t * am) +{ +#define _(id,n,crc) \ + vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id + sm->msg_id_base); + foreach_vl_msg_name_crc_vxlan_gpe_ioam_export; +#undef _ +} + + +static clib_error_t * +set_vxlan_gpe_ioam_export_ipfix_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + ioam_export_main_t *em = &vxlan_gpe_ioam_export_main; + ip4_address_t collector, src; + u8 is_disable = 0; + + collector.as_u32 = 0; + src.as_u32 = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "collector %U", unformat_ip4_address, &collector)) + ; + else if (unformat (input, "src %U", unformat_ip4_address, &src)) + ; + else if (unformat (input, "disable")) + is_disable = 1; + else + break; + } + + if (collector.as_u32 == 0) + return clib_error_return (0, "collector address required"); + + if (src.as_u32 == 0) + return clib_error_return (0, "src address required"); + + em->ipfix_collector.as_u32 = collector.as_u32; + em->src_address.as_u32 = src.as_u32; + + vlib_cli_output (vm, "Collector %U, src address %U", + format_ip4_address, &em->ipfix_collector, + format_ip4_address, &em->src_address); + + /* Turn on the export timer process */ + // vlib_process_signal_event (vm, flow_report_process_node.index, + //1, 0); + if (0 != + vxlan_gpe_ioam_export_enable_disable (em, is_disable, &collector, &src)) + { + return clib_error_return (0, "Unable to set ioam vxlan-gpe export"); + } + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (set_vxlan_gpe_ioam_ipfix_command, static) = +{ +.path = "set vxlan-gpe-ioam export ipfix", +.short_help = "set vxlan-gpe-ioam export ipfix collector <ip4-address> src <ip4-address>", +.function = set_vxlan_gpe_ioam_export_ipfix_command_fn, +}; +/* *INDENT-ON* */ + + +static clib_error_t * +vxlan_gpe_ioam_export_init (vlib_main_t * vm) +{ + ioam_export_main_t *em = &vxlan_gpe_ioam_export_main; + clib_error_t *error = 0; + u8 *name; + + em->set_id = IPFIX_VXLAN_IOAM_EXPORT_ID; + + name = format (0, "vxlan_gpe_ioam_export_%08x%c", api_version, 0); + + /* Ask for a correctly-sized block of API message decode slots */ + em->msg_id_base = vl_msg_api_get_msg_ids + ((char *) name, VL_MSG_FIRST_AVAILABLE); + em->unix_time_0 = (u32) time (0); /* Store starting time */ + em->vlib_time_0 = vlib_time_now (vm); + + error = vxlan_gpe_ioam_export_plugin_api_hookup (vm); + + /* Add our API messages to the global name_crc hash table */ + setup_message_id_table (em, &api_main); + + em->my_hbh_slot = ~0; + em->vlib_main = vm; + em->vnet_main = vnet_get_main (); + ioam_export_reset_next_node (em); + vec_free (name); + + return error; +} + +VLIB_INIT_FUNCTION (vxlan_gpe_ioam_export_init); 
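+ +/* + * Example debug CLI usage (the addresses are illustrative, not defaults; + * collector and src must also be supplied when disabling): + * set vxlan-gpe-ioam export ipfix collector 192.0.2.1 src 192.0.2.10 + * set vxlan-gpe-ioam export ipfix collector 192.0.2.1 src 192.0.2.10 disable + */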
+ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_all_api_h.h b/src/plugins/ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_all_api_h.h new file mode 100644 index 00000000..6d93f093 --- /dev/null +++ b/src/plugins/ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_all_api_h.h @@ -0,0 +1,16 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* Include the generated file, see BUILT_SOURCES in Makefile.am */ +#include <ioam/export-vxlan-gpe/vxlan_gpe_ioam_export.api.h> diff --git a/src/plugins/ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_msg_enum.h b/src/plugins/ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_msg_enum.h new file mode 100644 index 00000000..cc5698de --- /dev/null +++ b/src/plugins/ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_msg_enum.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef included_vxlan_gpe_ioam_export_msg_enum_h +#define included_vxlan_gpe_ioam_export_msg_enum_h + +#include <vppinfra/byte_order.h> + +#define vl_msg_id(n,h) n, +typedef enum { +#include <ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_all_api_h.h> + /* We'll want to know how many messages IDs we need... */ + VL_MSG_FIRST_AVAILABLE, +} vl_msg_id_t; +#undef vl_msg_id + +#endif /* included_vxlan_gpe_ioam_export_msg_enum_h */ diff --git a/src/plugins/ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_test.c b/src/plugins/ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_test.c new file mode 100644 index 00000000..17d31c95 --- /dev/null +++ b/src/plugins/ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_test.c @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + *------------------------------------------------------------------ + * vxlan_gpe_ioam_export_test.c - test harness plugin + *------------------------------------------------------------------ + */ + +#include <vat/vat.h> +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vlibsocket/api.h> +#include <vppinfra/error.h> + +#define __plugin_msg_base export_test_main.msg_id_base +#include <vlibapi/vat_helper_macros.h> + +/* Declare message IDs */ +#include <ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_msg_enum.h> + +/* define message structures */ +#define vl_typedefs +#include <ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_all_api_h.h> +#undef vl_typedefs + +/* declare message handlers for each api */ + +#define vl_endianfun /* define message structures */ +#include <ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_all_api_h.h> +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) +#define vl_printfun +#include <ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_all_api_h.h> +#undef vl_printfun + +/* Get the API version number. */ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_all_api_h.h> +#undef vl_api_version + + +typedef struct +{ + /* API message ID base */ + u16 msg_id_base; + vat_main_t *vat_main; +} export_test_main_t; + +export_test_main_t export_test_main; + +#define foreach_standard_reply_retval_handler \ +_(vxlan_gpe_ioam_export_enable_disable_reply) + +#define _(n) \ + static void vl_api_##n##_t_handler \ + (vl_api_##n##_t * mp) \ + { \ + vat_main_t * vam = export_test_main.vat_main; \ + i32 retval = ntohl(mp->retval); \ + if (vam->async_mode) { \ + vam->async_errors += (retval < 0); \ + } else { \ + vam->retval = retval; \ + vam->result_ready = 1; \ + } \ + } +foreach_standard_reply_retval_handler; +#undef _ + +/* + * Table of message reply handlers, must include boilerplate handlers + * we just generated + */ +#define foreach_vpe_api_reply_msg \ +_(VXLAN_GPE_IOAM_EXPORT_ENABLE_DISABLE_REPLY, vxlan_gpe_ioam_export_enable_disable_reply) + +static int +api_vxlan_gpe_ioam_export_enable_disable (vat_main_t * vam) +{ + unformat_input_t *i = vam->input; + int is_disable = 0; + vl_api_vxlan_gpe_ioam_export_enable_disable_t *mp; + int ret; + + /* Parse args required to build the message */ + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) + { + if (unformat (i, "disable")) + is_disable = 1; + else + break; + } + + /* Construct the API message */ + M (VXLAN_GPE_IOAM_EXPORT_ENABLE_DISABLE, mp); + mp->is_disable = is_disable; + + /* send it... */ + S (mp); + + /* Wait for a reply... 
*/ + W (ret); + return ret; +} + +/* + * List of messages that the api test plugin sends, + * and that the data plane plugin processes + */ +#define foreach_vpe_api_msg \ +_(vxlan_gpe_ioam_export_enable_disable, "<intfc> [disable]") + +static void +vxlan_gpe_ioam_vat_api_hookup (vat_main_t * vam) +{ + export_test_main_t *sm = &export_test_main; + /* Hook up handlers for replies from the data plane plug-in */ +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + sm->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_vpe_api_reply_msg; +#undef _ + + /* API messages we can send */ +#define _(n,h) hash_set_mem (vam->function_by_name, #n, api_##n); + foreach_vpe_api_msg; +#undef _ + + /* Help strings */ +#define _(n,h) hash_set_mem (vam->help_by_name, #n, h); + foreach_vpe_api_msg; +#undef _ +} + +clib_error_t * +vat_plugin_register (vat_main_t * vam) +{ + export_test_main_t *sm = &export_test_main; + u8 *name; + + sm->vat_main = vam; + + name = format (0, "vxlan_gpe_ioam_export_%08x%c", api_version, 0); + sm->msg_id_base = vl_client_get_first_plugin_msg_id ((char *) name); + + if (sm->msg_id_base != (u16) ~ 0) + vxlan_gpe_ioam_vat_api_hookup (vam); + + vec_free (name); + + return 0; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_thread.c b/src/plugins/ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_thread.c new file mode 100644 index 00000000..618278c6 --- /dev/null +++ b/src/plugins/ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_thread.c @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ioam_export_thread.c + */ +#include <vnet/api_errno.h> +#include <vppinfra/pool.h> +#include <ioam/export-common/ioam_export.h> + +static vlib_node_registration_t vxlan_gpe_ioam_export_process_node; +extern ioam_export_main_t vxlan_gpe_ioam_export_main; + +static uword +vxlan_gpe_ioam_export_process (vlib_main_t * vm, + vlib_node_runtime_t * rt, vlib_frame_t * f) +{ + return (ioam_export_process_common (&vxlan_gpe_ioam_export_main, + vm, rt, f, + vxlan_gpe_ioam_export_process_node.index)); +} + + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (vxlan_gpe_ioam_export_process_node, static) = +{ + .function = vxlan_gpe_ioam_export_process, + .type = VLIB_NODE_TYPE_PROCESS, + .name = "vxlan-gpe-ioam-export-process", +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/export-vxlan-gpe/vxlan_gpe_node.c b/src/plugins/ioam/export-vxlan-gpe/vxlan_gpe_node.c new file mode 100644 index 00000000..1395413a --- /dev/null +++ b/src/plugins/ioam/export-vxlan-gpe/vxlan_gpe_node.c @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vppinfra/error.h> +#include <vnet/ip/ip.h> +#include <vnet/vxlan-gpe/vxlan_gpe.h> +#include <vnet/vxlan-gpe/vxlan_gpe_packet.h> +#include <ioam/export-common/ioam_export.h> + +typedef struct +{ + u32 next_index; + u32 flow_label; +} export_trace_t; + +/* packet trace format function */ +static u8 * +format_export_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + export_trace_t *t = va_arg (*args, export_trace_t *); + + s = format (s, "EXPORT: flow_label %d, next index %d", + t->flow_label, t->next_index); + return s; +} + +vlib_node_registration_t vxlan_export_node; +extern vlib_node_registration_t export_node; +extern ioam_export_main_t vxlan_gpe_ioam_export_main; + +#define foreach_export_error \ +_(RECORDED, "Packets recorded for export") + +typedef enum +{ +#define _(sym,str) EXPORT_ERROR_##sym, + foreach_export_error +#undef _ + EXPORT_N_ERROR, +} export_error_t; + +static char *export_error_strings[] = { +#define _(sym,string) string, + foreach_export_error +#undef _ +}; + +typedef enum +{ + EXPORT_NEXT_VXLAN_GPE_INPUT, + EXPORT_N_NEXT, +} export_next_t; + +always_inline void +copy3cachelines (void *dst, const void *src, size_t n) +{ +#if 0 + if (PREDICT_FALSE (n < DEFAULT_EXPORT_SIZE)) + { + /* Copy only the first 1/2 cache lines whatever is available */ + if (n >= 64) + clib_mov64 ((u8 *) dst, (const u8 *) src); + if (n >= 128) + clib_mov64 ((u8 *) dst + 64, (const u8 *) src + 64); + return; + } + clib_mov64 ((u8 *) dst, (const u8 *) src); + clib_mov64 ((u8 *) dst + 64, (const u8 *) src + 64); + clib_mov64 ((u8 *) dst + 128, (const u8 *) src + 128); +#endif +#if 1 + + u64 *copy_dst, *copy_src; + int i; + copy_dst = (u64 *) dst; + copy_src = (u64 *) src; + if (PREDICT_FALSE (n < DEFAULT_EXPORT_SIZE)) + { + for (i = 0; i < n / 64; i++) + { + copy_dst[0] = copy_src[0]; + copy_dst[1] = copy_src[1]; + copy_dst[2] = copy_src[2]; + copy_dst[3] = copy_src[3]; + copy_dst[4] = copy_src[4]; + copy_dst[5] = copy_src[5]; + copy_dst[6] = copy_src[6]; + copy_dst[7] = copy_src[7]; + copy_dst += 8; + copy_src += 8; + } + return; + } + for (i = 0; i < 3; i++) + { + copy_dst[0] = copy_src[0]; + copy_dst[1] = copy_src[1]; + copy_dst[2] = copy_src[2]; + copy_dst[3] = copy_src[3]; + copy_dst[4] = copy_src[4]; + copy_dst[5] = copy_src[5]; + copy_dst[6] = copy_src[6]; + copy_dst[7] = copy_src[7]; + copy_dst += 8; + copy_src += 8; + } +#endif +} + +static void +vxlan_gpe_export_fixup_func (vlib_buffer_t * export_buf, + vlib_buffer_t * pak_buf) +{ + /* Todo: on implementing VXLAN GPE analyse */ +} + +static uword +vxlan_gpe_export_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + ioam_export_main_t *em = &vxlan_gpe_ioam_export_main; + ioam_export_node_common (em, vm, node, frame, ip4_header_t, length, + 
ip_version_and_header_length, + EXPORT_NEXT_VXLAN_GPE_INPUT, + vxlan_gpe_export_fixup_func); + return frame->n_vectors; +} + +/* + * Node for VXLAN-GPE export + */ +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (vxlan_export_node) = +{ + .function = vxlan_gpe_export_node_fn, + .name = "vxlan-gpe-ioam-export", + .vector_size = sizeof (u32), + .format_trace = format_export_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN (export_error_strings), + .error_strings = export_error_strings, + .n_next_nodes = EXPORT_N_NEXT, + /* edit / add dispositions here */ + .next_nodes = + {[EXPORT_NEXT_VXLAN_GPE_INPUT] = "vxlan-gpe-pop-ioam-v4"}, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/export/ioam_export.api b/src/plugins/ioam/export/ioam_export.api new file mode 100644 index 00000000..bb830561 --- /dev/null +++ b/src/plugins/ioam/export/ioam_export.api @@ -0,0 +1,34 @@ +/* Hey Emacs use -*- mode: C -*- */ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Define a simple binary API to control the feature */ + +autoreply define ioam_export_ip6_enable_disable { + /* Client identifier, set from api_main.my_client_index */ + u32 client_index; + + /* Arbitrary context, so client can match reply to request */ + u32 context; + + /* Enable / disable the feature */ + u8 is_disable; + + /* Collector ip address */ + u8 collector_address[4]; + u8 src_address[4]; + + /* Src ip address */ +}; diff --git a/src/plugins/ioam/export/ioam_export.c b/src/plugins/ioam/export/ioam_export.c new file mode 100644 index 00000000..46ac3d4a --- /dev/null +++ b/src/plugins/ioam/export/ioam_export.c @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + *------------------------------------------------------------------ + * ioam_export.c - ioam export API / debug CLI handling + *------------------------------------------------------------------ + */ + +#include <vnet/vnet.h> +#include <vnet/plugin/plugin.h> +#include <ioam/export-common/ioam_export.h> + +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vlibsocket/api.h> +#include <vnet/ip/ip6_hop_by_hop.h> + + +/* define message IDs */ +#include <ioam/export/ioam_export_msg_enum.h> + +/* define message structures */ +#define vl_typedefs +#include <ioam/export/ioam_export_all_api_h.h> +#undef vl_typedefs + +/* define generated endian-swappers */ +#define vl_endianfun +#include <ioam/export/ioam_export_all_api_h.h> +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__) +#define vl_printfun +#include <ioam/export/ioam_export_all_api_h.h> +#undef vl_printfun + +/* Get the API version number */ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <ioam/export/ioam_export_all_api_h.h> +#undef vl_api_version + +#define REPLY_MSG_ID_BASE sm->msg_id_base +#include <vlibapi/api_helper_macros.h> + +/* List of message types that this plugin understands */ +#define foreach_ioam_export_plugin_api_msg \ +_(IOAM_EXPORT_IP6_ENABLE_DISABLE, ioam_export_ip6_enable_disable) + +ioam_export_main_t ioam_export_main; + +extern vlib_node_registration_t export_node; + +/* Action function shared between message handler and debug CLI */ + +int +ioam_export_ip6_enable_disable (ioam_export_main_t * em, + u8 is_disable, + ip4_address_t * collector_address, + ip4_address_t * src_address) +{ + vlib_main_t *vm = em->vlib_main; + + if (is_disable == 0) + { + if (1 == ioam_export_header_create (em, collector_address, src_address)) + { + ioam_export_thread_buffer_init (em, vm); + ip6_hbh_set_next_override (em->my_hbh_slot); + /* Turn on the export buffer check process */ + vlib_process_signal_event (vm, em->export_process_node_index, 1, 0); + + } + else + { + return (-2); + } + } + else + { + ip6_hbh_set_next_override (IP6_LOOKUP_NEXT_POP_HOP_BY_HOP); + ioam_export_header_cleanup (em, collector_address, src_address); + ioam_export_thread_buffer_free (em); + /* Turn off the export buffer check process */ + vlib_process_signal_event (vm, em->export_process_node_index, 2, 0); + + } + + return 0; +} + +/* API message handler */ +static void vl_api_ioam_export_ip6_enable_disable_t_handler + (vl_api_ioam_export_ip6_enable_disable_t * mp) +{ + vl_api_ioam_export_ip6_enable_disable_reply_t *rmp; + ioam_export_main_t *sm = &ioam_export_main; + int rv; + + rv = ioam_export_ip6_enable_disable (sm, (int) (mp->is_disable), + (ip4_address_t *) + mp->collector_address, + (ip4_address_t *) mp->src_address); + + REPLY_MACRO (VL_API_IOAM_EXPORT_IP6_ENABLE_DISABLE_REPLY); +} + +/* Set up the API message handling tables */ +static clib_error_t * +ioam_export_plugin_api_hookup (vlib_main_t * vm) +{ + ioam_export_main_t *sm = &ioam_export_main; +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + sm->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_ioam_export_plugin_api_msg; +#undef _ + + return 0; +} + +#define vl_msg_name_crc_list +#include <ioam/export/ioam_export_all_api_h.h> +#undef vl_msg_name_crc_list + +static void +setup_message_id_table (ioam_export_main_t * sm, api_main_t * am) +{ 
+#define _(id,n,crc) \
+  vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id + sm->msg_id_base);
+  foreach_vl_msg_name_crc_ioam_export;
+#undef _
+}
+
+static clib_error_t *
+set_ioam_export_ipfix_command_fn (vlib_main_t * vm,
+				  unformat_input_t * input,
+				  vlib_cli_command_t * cmd)
+{
+  ioam_export_main_t *em = &ioam_export_main;
+  ip4_address_t collector, src;
+  u8 is_disable = 0;
+
+  collector.as_u32 = 0;
+  src.as_u32 = 0;
+
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (input, "collector %U", unformat_ip4_address, &collector))
+	;
+      else if (unformat (input, "src %U", unformat_ip4_address, &src))
+	;
+      else if (unformat (input, "disable"))
+	is_disable = 1;
+      else
+	break;
+    }
+
+  if (collector.as_u32 == 0)
+    return clib_error_return (0, "collector address required");
+
+  if (src.as_u32 == 0)
+    return clib_error_return (0, "src address required");
+
+  em->ipfix_collector.as_u32 = collector.as_u32;
+  em->src_address.as_u32 = src.as_u32;
+
+  vlib_cli_output (vm, "Collector %U, src address %U",
+		   format_ip4_address, &em->ipfix_collector,
+		   format_ip4_address, &em->src_address);
+
+  /* Turn on the export timer process */
+  // vlib_process_signal_event (vm, flow_report_process_node.index,
+  //1, 0);
+  /* Propagate failures, as the vxlan-gpe variant of this command does */
+  if (0 != ioam_export_ip6_enable_disable (em, is_disable, &collector, &src))
+    return clib_error_return (0, "Unable to set ioam ip6 export");
+
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (set_ipfix_command, static) =
+{
+.path = "set ioam export ipfix",
+.short_help = "set ioam export ipfix collector <ip4-address> src <ip4-address>",
+.function = set_ioam_export_ipfix_command_fn,
+};
+/* *INDENT-ON* */
+
+
+static clib_error_t *
+ioam_export_init (vlib_main_t * vm)
+{
+  ioam_export_main_t *em = &ioam_export_main;
+  clib_error_t *error = 0;
+  u8 *name;
+  u32 node_index = export_node.index;
+  vlib_node_t *ip6_hbyh_node = NULL;
+
+  em->vlib_main = vm;
+  em->vnet_main = vnet_get_main ();
+  em->set_id = IPFIX_IOAM_EXPORT_ID;
+  ioam_export_reset_next_node (em);
+
+  name = format (0, "ioam_export_%08x%c", api_version, 0);
+
+  /* Ask for a correctly-sized block of API message decode slots */
+  em->msg_id_base = vl_msg_api_get_msg_ids
+    ((char *) name, VL_MSG_FIRST_AVAILABLE);
+  em->unix_time_0 = (u32) time (0);	/* Store starting time */
+  em->vlib_time_0 = vlib_time_now (vm);
+
+  error = ioam_export_plugin_api_hookup (vm);
+
+  /* Add our API messages to the global name_crc hash table */
+  setup_message_id_table (em, &api_main);
+
+  /* Hook this export node to ip6-hop-by-hop */
+  ip6_hbyh_node = vlib_get_node_by_name (vm, (u8 *) "ip6-hop-by-hop");
+  em->my_hbh_slot = vlib_node_add_next (vm, ip6_hbyh_node->index, node_index);
+  vec_free (name);
+
+  return error;
+}
+
+VLIB_INIT_FUNCTION (ioam_export_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/ioam/export/ioam_export_all_api_h.h b/src/plugins/ioam/export/ioam_export_all_api_h.h
new file mode 100644
index 00000000..bc4368f2
--- /dev/null
+++ b/src/plugins/ioam/export/ioam_export_all_api_h.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* Include the generated file, see BUILT_SOURCES in Makefile.am */ +#include <ioam/export/ioam_export.api.h> diff --git a/src/plugins/ioam/export/ioam_export_msg_enum.h b/src/plugins/ioam/export/ioam_export_msg_enum.h new file mode 100644 index 00000000..c2de7988 --- /dev/null +++ b/src/plugins/ioam/export/ioam_export_msg_enum.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef included_ioam_export_msg_enum_h +#define included_ioam_export_msg_enum_h + +#include <vppinfra/byte_order.h> + +#define vl_msg_id(n,h) n, +typedef enum { +#include <ioam/export/ioam_export_all_api_h.h> + /* We'll want to know how many messages IDs we need... */ + VL_MSG_FIRST_AVAILABLE, +} vl_msg_id_t; +#undef vl_msg_id + +#endif /* included_ioam_export_msg_enum_h */ diff --git a/src/plugins/ioam/export/ioam_export_test.c b/src/plugins/ioam/export/ioam_export_test.c new file mode 100644 index 00000000..5023afd7 --- /dev/null +++ b/src/plugins/ioam/export/ioam_export_test.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + *------------------------------------------------------------------ + * ioam_export_test.c - test harness plugin + *------------------------------------------------------------------ + */ + +#include <vat/vat.h> +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vlibsocket/api.h> +#include <vppinfra/error.h> + +#define __plugin_msg_base export_test_main.msg_id_base +#include <vlibapi/vat_helper_macros.h> + + +/* Declare message IDs */ +#include <ioam/export/ioam_export_msg_enum.h> + +/* define message structures */ +#define vl_typedefs +#include <ioam/export/ioam_export_all_api_h.h> +#undef vl_typedefs + +/* declare message handlers for each api */ + +#define vl_endianfun /* define message structures */ +#include <ioam/export/ioam_export_all_api_h.h> +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) 
+#define vl_printfun +#include <ioam/export/ioam_export_all_api_h.h> +#undef vl_printfun + +/* Get the API version number. */ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <ioam/export/ioam_export_all_api_h.h> +#undef vl_api_version + + +typedef struct +{ + /* API message ID base */ + u16 msg_id_base; + vat_main_t *vat_main; +} export_test_main_t; + +export_test_main_t export_test_main; + +#define foreach_standard_reply_retval_handler \ +_(ioam_export_ip6_enable_disable_reply) + +#define _(n) \ + static void vl_api_##n##_t_handler \ + (vl_api_##n##_t * mp) \ + { \ + vat_main_t * vam = export_test_main.vat_main; \ + i32 retval = ntohl(mp->retval); \ + if (vam->async_mode) { \ + vam->async_errors += (retval < 0); \ + } else { \ + vam->retval = retval; \ + vam->result_ready = 1; \ + } \ + } +foreach_standard_reply_retval_handler; +#undef _ + +/* + * Table of message reply handlers, must include boilerplate handlers + * we just generated + */ +#define foreach_vpe_api_reply_msg \ +_(IOAM_EXPORT_IP6_ENABLE_DISABLE_REPLY, ioam_export_ip6_enable_disable_reply) + + +static int +api_ioam_export_ip6_enable_disable (vat_main_t * vam) +{ + unformat_input_t *i = vam->input; + int is_disable = 0; + vl_api_ioam_export_ip6_enable_disable_t *mp; + int ret; + + /* Parse args required to build the message */ + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) + { + if (unformat (i, "disable")) + is_disable = 1; + else + break; + } + + /* Construct the API message */ + M(IOAM_EXPORT_IP6_ENABLE_DISABLE, mp); + mp->is_disable = is_disable; + + /* send it... */ + S(mp); + + /* Wait for a reply... */ + W (ret); + return ret; +} + +/* + * List of messages that the api test plugin sends, + * and that the data plane plugin processes + */ +#define foreach_vpe_api_msg \ +_(ioam_export_ip6_enable_disable, "<intfc> [disable]") + +static void +ioam_export_vat_api_hookup (vat_main_t * vam) +{ + export_test_main_t *sm = &export_test_main; + /* Hook up handlers for replies from the data plane plug-in */ +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + sm->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_vpe_api_reply_msg; +#undef _ + + /* API messages we can send */ +#define _(n,h) hash_set_mem (vam->function_by_name, #n, api_##n); + foreach_vpe_api_msg; +#undef _ + + /* Help strings */ +#define _(n,h) hash_set_mem (vam->help_by_name, #n, h); + foreach_vpe_api_msg; +#undef _ +} + +clib_error_t * +vat_plugin_register (vat_main_t * vam) +{ + export_test_main_t *sm = &export_test_main; + u8 *name; + + sm->vat_main = vam; + + name = format (0, "ioam_export_%08x%c", api_version, 0); + sm->msg_id_base = vl_client_get_first_plugin_msg_id ((char *) name); + + if (sm->msg_id_base != (u16) ~ 0) + ioam_export_vat_api_hookup (vam); + + vec_free (name); + + return 0; +} diff --git a/src/plugins/ioam/export/ioam_export_thread.c b/src/plugins/ioam/export/ioam_export_thread.c new file mode 100644 index 00000000..5f1d9643 --- /dev/null +++ b/src/plugins/ioam/export/ioam_export_thread.c @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ioam_export_thread.c + */ +#include <vnet/api_errno.h> +#include <vppinfra/pool.h> +#include <ioam/export-common/ioam_export.h> + +static vlib_node_registration_t ioam_export_process_node; +extern ioam_export_main_t ioam_export_main; + +static uword +ioam_export_process (vlib_main_t * vm, + vlib_node_runtime_t * rt, vlib_frame_t * f) +{ + return (ioam_export_process_common(&ioam_export_main, + vm, rt, f, + ioam_export_process_node.index)); +} + +VLIB_REGISTER_NODE (ioam_export_process_node, static) = +{ + .function = ioam_export_process, + .type = VLIB_NODE_TYPE_PROCESS, + .name = "ioam-export-process", +}; diff --git a/src/plugins/ioam/export/node.c b/src/plugins/ioam/export/node.c new file mode 100644 index 00000000..9b61c902 --- /dev/null +++ b/src/plugins/ioam/export/node.c @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vppinfra/error.h> +#include <vnet/ip/ip.h> +#include <vnet/ip/ip6_hop_by_hop.h> +#include <ioam/export-common/ioam_export.h> + + +typedef struct +{ + u32 next_index; + u32 flow_label; +} export_trace_t; + +/* packet trace format function */ +static u8 * +format_export_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + export_trace_t *t = va_arg (*args, export_trace_t *); + + s = format (s, "EXPORT: flow_label %d, next index %d", + t->flow_label, t->next_index); + return s; +} + +vlib_node_registration_t export_node; +extern ioam_export_main_t ioam_export_main; + +#define foreach_export_error \ +_(RECORDED, "Packets recorded for export") + +typedef enum +{ +#define _(sym,str) EXPORT_ERROR_##sym, + foreach_export_error +#undef _ + EXPORT_N_ERROR, +} export_error_t; + +static char *export_error_strings[] = { +#define _(sym,string) string, + foreach_export_error +#undef _ +}; + +typedef enum +{ + EXPORT_NEXT_POP_HBYH, + EXPORT_N_NEXT, +} export_next_t; + +always_inline void +copy3cachelines (void *dst, const void *src, size_t n) +{ +#if 0 + if (PREDICT_FALSE (n < DEFAULT_EXPORT_SIZE)) + { + /* Copy only the first 1/2 cache lines whatever is available */ + if (n >= 64) + clib_mov64 ((u8 *) dst, (const u8 *) src); + if (n >= 128) + clib_mov64 ((u8 *) dst + 64, (const u8 *) src + 64); + return; + } + clib_mov64 ((u8 *) dst, (const u8 *) src); + clib_mov64 ((u8 *) dst + 64, (const u8 *) src + 64); + clib_mov64 ((u8 *) dst + 128, (const u8 *) src + 128); +#endif +#if 1 + + u64 *copy_dst, *copy_src; + int i; + copy_dst = (u64 *) dst; + copy_src = (u64 *) src; + if (PREDICT_FALSE (n < DEFAULT_EXPORT_SIZE)) + { + for (i = 0; i < n / 64; i++) + { + copy_dst[0] = copy_src[0]; + copy_dst[1] = copy_src[1]; + copy_dst[2] = copy_src[2]; + copy_dst[3] = copy_src[3]; + copy_dst[4] = copy_src[4]; + copy_dst[5] = copy_src[5]; + copy_dst[6] = copy_src[6]; + copy_dst[7] = copy_src[7]; + copy_dst += 8; + copy_src += 8; + } + return; + } + for (i = 0; i < 3; i++) + { + copy_dst[0] = copy_src[0]; + copy_dst[1] = copy_src[1]; + copy_dst[2] = copy_src[2]; + copy_dst[3] = copy_src[3]; + copy_dst[4] = copy_src[4]; + copy_dst[5] = copy_src[5]; + copy_dst[6] = copy_src[6]; + copy_dst[7] = copy_src[7]; + copy_dst += 8; + copy_src += 8; + } +#endif +} + +static void +ip6_export_fixup_func (vlib_buffer_t * export_buf, vlib_buffer_t * pak_buf) +{ + ip6_header_t *ip6_temp = + (ip6_header_t *) (export_buf->data + export_buf->current_length); + u32 flow_label_temp = + clib_net_to_host_u32(ip6_temp->ip_version_traffic_class_and_flow_label) + & 0xFFF00000; + flow_label_temp |= + IOAM_MASK_DECAP_BIT((vnet_buffer(pak_buf)->l2_classify.opaque_index)); + ip6_temp->ip_version_traffic_class_and_flow_label = + clib_host_to_net_u32(flow_label_temp); +} + +static uword +ip6_export_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + ioam_export_main_t *em = &ioam_export_main; + ioam_export_node_common(em, vm, node, frame, ip6_header_t, payload_length, + ip_version_traffic_class_and_flow_label, + EXPORT_NEXT_POP_HBYH, ip6_export_fixup_func); + return frame->n_vectors; +} + +/* + * Node for IP6 export + */ +VLIB_REGISTER_NODE (export_node) = +{ + .function = ip6_export_node_fn, + .name = "ip6-export", + .vector_size = sizeof (u32), + .format_trace = format_export_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + 
.n_errors = ARRAY_LEN (export_error_strings),
+  .error_strings = export_error_strings,
+  .n_next_nodes = EXPORT_N_NEXT,
+  /* edit / add dispositions here */
+  .next_nodes =
+  {
+    [EXPORT_NEXT_POP_HBYH] = "ip6-pop-hop-by-hop"
+  },
+};
diff --git a/src/plugins/ioam/ioam_plugin_doc.md b/src/plugins/ioam/ioam_plugin_doc.md
new file mode 100644
index 00000000..343abcf7
--- /dev/null
+++ b/src/plugins/ioam/ioam_plugin_doc.md
@@ -0,0 +1,464 @@
+## VPP Inband OAM (iOAM) {#ioam_plugin_doc}
+
+In-band OAM (iOAM) is an implementation study to record operational
+information in the packet while the packet traverses a path between
+two points in the network.
+
+An overview of iOAM can be found on the [iOAM-Devnet] page.
+The following IETF drafts detail the motivation and mechanism for
+recording operational information:
+ - [iOAM-ietf-requirements] - Describes the motivation and use cases for iOAM
+ - [iOAM-ietf-data] - Describes the data records that can be collected using iOAM
+ - [iOAM-ietf-transport] - Lists the transport protocols
+ and mechanisms to carry iOAM data records
+ - [iOAM-ietf-proof-of-transit] - Describes the idea of Proof of Transit (POT)
+ and mechanisms to operationalize the idea
+
+## Terminology
+In-band OAM is expected to be deployed in a specific domain rather
+than on the overall Internet. The part of the network which employs in-band OAM
+is referred to as the **"in-band OAM-domain"**.
+
+In-band OAM data is added to a packet on entering the in-band OAM-domain
+and is removed from the packet when exiting the domain.
+Within the in-band OAM-domain, network nodes that the packet traverses
+may update the in-band OAM data records.
+
+- The node which adds in-band OAM data to the packet is called the
+**"in-band OAM encapsulating node"**.
+
+- The node which removes the in-band OAM data is referred to as the
+**"in-band OAM decapsulating node"**.
+
+- Nodes within the domain which are aware of in-band OAM data and read,
+write, or otherwise process the in-band OAM data are called
+**"in-band OAM transit nodes"**.
+
+## Features supported in the current release
+VPP can function as an in-band OAM encapsulating, transit, or decapsulating node.
+In this version of VPP, in-band OAM data is transported as options in an
+IPv6 hop-by-hop extension header; hence in-band OAM can be enabled
+for IPv6 traffic.
+
+The following iOAM features are supported:
+
+- **In-band OAM Tracing**: In-band OAM allows multiple data records to be
+recorded in the packet as the packet traverses the network.
+These data records offer insights into the operational behavior of the network.
+The following information can be collected in the tracing
+data from the nodes a packet traverses:
+ - Node ID
+ - Ingress interface ID
+ - Egress interface ID
+ - Timestamp
+ - Pre-configured application data
+
+- **In-band OAM Proof of Transit (POT)**: Proof-of-transit iOAM data is
+added to every packet for verifying that a packet traverses a specific
+set of nodes.
+In-band OAM data is updated at every node that is enabled with iOAM
+proof of transit and is used to verify whether a packet traversed
+all the specified nodes. When the verifier receives each packet,
+it can validate whether the packet traversed the specified nodes.
+
+
+## Configuration
+Configuring iOAM involves:
+- Selecting the packets for which iOAM data must be inserted, updated, or removed
+ - Selection of packets for iOAM data insertion on the iOAM encapsulating node.
+ Selection of packets is done by 5-tuple-based classification
+ - Selection of packets for updating iOAM data is done implicitly, based on the
+ presence of iOAM options in the packet
+ - Selection of packets for removing the iOAM data is done by 5-tuple-based
+ classification
+- The kind of data to be collected
+ - Tracing data
+ - Proof of transit
+- Additional details for processing the iOAM data to be collected
+ - For trace data - trace type, number of nodes to be recorded in the trace,
+ timestamp precision, etc.
+ - For POT data - configuration of the POT profile required to process the POT data
+
+The CLI for configuring iOAM is explained here, followed by detailed steps
+and examples to deploy iOAM on VPP as an encapsulating, transit, or
+decapsulating iOAM node in the subsequent sub-sections.
+
+VPP iOAM configuration for enabling trace and POT is as follows:
+
+    set ioam rewrite trace-type <0x1f|0x7|0x9|0x11|0x19>
+    trace-elts <number of trace elements> trace-tsp <0|1|2|3>
+    node-id <node ID in hex> app-data <application data in hex> [pot]
+
+A description of each of the options of the CLI follows:
+- trace-type : An entry in the "Node data List" array of the trace option
+can have different formats, following the needs of a deployment.
+For example, some deployments might only be interested
+in recording the node identifiers, whereas others might be interested
+in recording the node identifier and timestamp.
+The following types are currently supported:
+    - 0x1f : Node data to include hop limit (8 bits), node ID (24 bits),
+   ingress and egress interface IDs (16 bits each), timestamp (32 bits),
+   application data (32 bits)
+    - 0x7 : Node data to include hop limit (8 bits), node ID (24 bits),
+   ingress and egress interface IDs (16 bits each)
+    - 0x9 : Node data to include hop limit (8 bits), node ID (24 bits),
+   timestamp (32 bits)
+    - 0x11: Node data to include hop limit (8 bits), node ID (24 bits),
+   application data (32 bits)
+    - 0x19: Node data to include hop limit (8 bits), node ID (24 bits),
+   timestamp (32 bits), application data (32 bits)
+- trace-elts : Defines the length of the node data array in the trace option.
+- trace-tsp : Defines the timestamp precision to use, with the enumerated values
+  for precision as follows:
+    - 0 : 32-bit timestamp in seconds
+    - 1 : 32-bit timestamp in milliseconds
+    - 2 : 32-bit timestamp in microseconds
+    - 3 : 32-bit timestamp in nanoseconds
+- node-id : Unique identifier for the node, included in the node ID
+  field of the node data in the trace option.
+- app-data : The value configured here is included as-is in the
+application data field of the node data in the trace option.
+- pot : Enables the POT option to be included in the iOAM options.
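+
+To make these layouts concrete, the trace-type 0x1f node data entry described
+above can be pictured as the following packed structure (an illustrative
+sketch only; the field and type names here are hypothetical, and the
+authoritative definitions live in the iOAM trace headers):
+
+    typedef struct {
+      u32 ttl_node_id;   /* hop limit (8 bits) | node ID (24 bits) */
+      u16 ingress_if;    /* ingress interface ID */
+      u16 egress_if;     /* egress interface ID */
+      u32 timestamp;     /* precision selected by trace-tsp */
+      u32 app_data;      /* pre-configured application data */
+    } trace_node_data_t;
+
+The other trace types omit fields from this layout, so trace-type together
+with trace-elts determines how much space is reserved in the hop-by-hop
+header.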
+
+### Trace configuration
+
+#### On in-band OAM encapsulating node
+ - **Configure a classifier and apply an ACL** to select packets for
+ iOAM data insertion
+ - Example to enable iOAM data insertion for all the packets
+ towards IPv6 address db06::06:
+
+    vpp# classify table miss-next node ip6-lookup mask l3 ip6 dst
+
+    vpp# classify session acl-hit-next node ip6-add-hop-by-hop
+ table-index 0 match l3 ip6 dst db06::06
+
+    vpp# set int input acl intfc GigabitEthernet0/0/0 ip6-table 0
+
+ - **Enable tracing**: Specify the node ID, the maximum number of nodes for which
+ trace data should be recorded, the type of data to be recorded, and,
+ optionally, application data to be included
+ - Example to enable tracing with a maximum of 4 nodes recorded,
+ where the recorded data includes hop limit, node ID,
+ ingress and egress interface IDs, timestamp (millisecond precision),
+ and application data (0x1234):
+
+
+    vpp# set ioam rewrite trace-type 0x1f trace-elts 4 trace-tsp 1
+ node-id 0x1 app-data 0x1234
+
+
+
+#### On in-band OAM transit node
+- The transit node requires the trace type, timestamp precision, node ID, and
+optionally application data to be configured
+in order to update its node data in the trace option.
+
+Example:
+
+    vpp# set ioam rewrite trace-type 0x1f trace-elts 4 trace-tsp 1
+ node-id 0x2 app-data 0x1234
+
+#### On in-band OAM decapsulating node
+- The decapsulating node, like the encapsulating node, requires
+**classification** of the packets from which iOAM data is to be removed.
+ - Example to decapsulate iOAM data for packets towards
+ db06::06; configure the classifier and enable it as an ACL as follows:
+
+
+    vpp# classify table miss-next node ip6-lookup mask l3 ip6 dst
+
+    vpp# classify session acl-hit-next node ip6-lookup table-index 0
+ match l3 ip6 dst db06::06 opaque-index 100
+
+    vpp# set int input acl intfc GigabitEthernet0/0/0 ip6-table 0
+
+
+- The decapsulating node requires the trace type, timestamp precision,
+node ID, and optionally application data to be configured
+in order to update its node data in the trace option before the option is removed.
+
+Example:
+
+    vpp# set ioam rewrite trace-type 0x1f trace-elts 4
+    trace-tsp 1 node-id 0x3 app-data 0x1234
+
+
+### Proof of Transit configuration
+
+For details on proof of transit,
+see the IETF draft [iOAM-ietf-proof-of-transit].
+To enable Proof of Transit, all the nodes that participate,
+and hence are verified for transit, need a proof-of-transit profile.
+A script to generate a proof-of-transit profile as per the mechanism
+described in [iOAM-ietf-proof-of-transit] will be available at [iOAM-Devnet].
+
+The proof-of-transit mechanism implemented here is based on
+Shamir's Secret Sharing algorithm.
+The overall algorithm uses two polynomials,
+POLY-1 and POLY-2. The degree of the polynomials depends on the number of nodes
+to be verified for transit.
+POLY-1 is secret and constant. Each node gets a point on POLY-1
+at setup time and keeps it secret.
+POLY-2 is public, random, and per-packet.
+Each node is assigned a point on POLY-1 and POLY-2 with the same x index.
+Each node derives its point on POLY-2 each time a packet arrives at it.
+A node then contributes its points on POLY-1 and POLY-2 to construct
+POLY-3 (POLY-3 = POLY-1 + POLY-2) using Lagrange extrapolation and
+forwards it towards the verifier by updating the POT data in the packet.
+The verifier constructs POLY-3 from the accumulated value from all the nodes
+and its own points on POLY-1 and POLY-2 and verifies whether
+POLY-3 = POLY-1 + POLY-2. Only the verifier knows POLY-1.
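+
+As a toy illustration of this reconstruction (tiny numbers chosen only for
+readability, and ignoring the modular arithmetic described next), take the
+secret POLY-1(x) = 3 + 2x and a per-packet POLY-2(x) = 1 + 4x, with two
+transit nodes holding points at x=1 and x=2:
+
+    Node A (x=1): POLY-1 share 5, derived POLY-2 point 5, LPC = 2
+    Node B (x=2): POLY-1 share 7, derived POLY-2 point 9, LPC = -1
+    Cumulative  = 2*(5+5) + (-1)*(7+9) = 20 - 16 = 4
+    Verifier    : POLY-1(0) + POLY-2(0) = 3 + 1 = 4, so the check passes
+
+A packet that skipped either node could not produce the correct cumulative
+value, since the missing POLY-1 share is secret.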
+The solution leverages finite-field arithmetic in a field of prime size,
+for reasons explained in the description of Shamir's secret sharing algorithm.
+
+Here is an explanation of the POT profile list and the profile configuration CLI
+used to realize the above mechanism.
+It is best to use the script provided at [iOAM-Devnet] to generate
+this configuration.
+- **Create POT profile** : set pot profile name <string> id [0-1]
+[validator-key 0xu64] prime-number 0xu64 secret_share 0xu64
+lpc 0xu64 polynomial2 0xu64 bits-in-random [0-64]
+    - name : Profile list name.
+    - id : Profile ID; it can be 0 or 1.
+    A maximum of two profiles can be configured per profile list.
+    - validator-key : Secret key, configured only on the
+    verifier/decapsulating node, used to compare and verify proof of transit.
+    - prime-number : Prime number for the finite-field arithmetic as required by the
+    proof-of-transit mechanism.
+    - secret_share : Unique point for each node on the secret polynomial POLY-1.
+    - lpc : Lagrange Polynomial Constant (LPC), calculated per node from
+    its point (the x value used for evaluating the points on the polynomial),
+    used in Lagrange extrapolation
+    for reconstructing the polynomial (POLY-3).
+    - polynomial2 : The pre-evaluated value of the node's point on the
+    second polynomial (POLY-2). This is unique for each node.
+    It is pre-evaluated for all the coefficients of POLY-2 except
+    for the constant part of the polynomial, which changes per packet
+    and is received as part of the POT data in the packet.
+    - bits-in-random : Controls the size of the random number to be
+    generated. This number has to match the other numbers generated and used
+    in the profile as per the algorithm.
+
+- **Set a configured profile as active/in-use** :
+set pot profile-active name <string> ID [0-1]
+    - name : Name of the profile list to be used for computing
+    POT data per packet.
+    - ID : Identifier of the profile within the list to be used.
+
+#### On in-band OAM encapsulating node
+ - Configure the classifier and apply an ACL to select packets for iOAM data insertion.
+    - Example to enable iOAM data insertion for all the packets towards
+    IPv6 address db06::06:
+
+
+    vpp# classify table miss-next node ip6-lookup mask l3 ip6 dst
+
+    vpp# classify session acl-hit-next node
+    ip6-add-hop-by-hop table-index 0 match l3 ip6 dst db06::06
+
+    vpp# set int input acl intfc GigabitEthernet0/0/0 ip6-table 0
+
+
+ - Configure the proof-of-transit profile list with profiles.
+Each profile list, referred to by a name, can contain two profiles;
+only one is in use for updating proof-of-transit data at any time.
+    - An example profile list, with a profile generated from the
+    script to verify transit through 3 nodes:
+
+
+    vpp# set pot profile name example id 0 prime-number 0x7fff0000fa884685
+    secret_share 0x6c22eff0f45ec56d lpc 0x7fff0000fa884682
+    polynomial2 0xffb543d4a9c bits-in-random 63
+
+ - Enable one of the profiles from the configured profile list as active
+ so that it will be used for calculating proof of transit
+
+Example to enable profile ID 0 from the profile list "example" configured above:
+
+
+    vpp# set pot profile-active name example ID 0
+
+
+ - Enable the POT option to be inserted
+
+
+    vpp# set ioam rewrite pot
+
+
+#### On in-band OAM transit node
+ - Configure the proof-of-transit profile list with profiles for the transit node.
+Example:
+
+
+    vpp# set pot profile name example id 0 prime-number 0x7fff0000fa884685
+    secret_share 0x564cdbdec4eb625d lpc 0x1
+    polynomial2 0x23f3a227186a bits-in-random 63
+
+#### On in-band OAM decapsulating node / verifier
+- The decapsulating node, like the encapsulating node, requires
+classification of the packets from which iOAM data is to be removed.
+    - Example to decapsulate iOAM data for packets towards db06::06;
+    configure the classifier and enable it as an ACL as follows:
+
+
+    vpp# classify table miss-next node ip6-lookup mask l3 ip6 dst
+
+    vpp# classify session acl-hit-next node ip6-lookup table-index 0
+    match l3 ip6 dst db06::06 opaque-index 100
+
+    vpp# set int input acl intfc GigabitEthernet0/0/0 ip6-table 0
+
+- To update and verify the proof of transit, a POT profile list should be configured.
+    - An example POT profile list:
+
+    vpp# set pot profile name example id 0 validate-key 0x7fff0000fa88465d
+    prime-number 0x7fff0000fa884685 secret_share 0x7a08fbfc5b93116d lpc 0x3
+    polynomial2 0x3ff738597ce bits-in-random 63
+
+## Operational data
+
+The following CLIs are available to check iOAM operation:
+- To check the iOAM configuration that is in effect, use "show ioam summary"
+
+Example:
+
+    vpp# show ioam summary
+    REWRITE FLOW CONFIGS - Not configured
+    HOP BY HOP OPTIONS - TRACE CONFIG -
+    Trace Type : 0x1f (31)
+    Trace timestamp precision : 1 (Milliseconds)
+    Num of trace nodes : 4
+    Node-id : 0x2 (2)
+    App Data : 0x1234 (4660)
+    POT OPTION - 1 (Enabled)
+    Try 'show ioam pot and show pot profile' for more information
+
+- To find statistics about packets for which iOAM options were
+added (encapsulating node) or removed (decapsulating node), execute
+*show errors*
+
+Example on an encapsulating node:
+
+
+    vpp# show error
+     Count            Node                Reason
+    1208804706        ip6-inacl           input ACL hits
+    1208804706        ip6-add-hop-by-hop  Pkts w/ added ip6 hop-by-hop options
+
+Example on a decapsulating node:
+
+    vpp# show error
+     Count            Node                Reason
+    69508569          ip6-inacl           input ACL hits
+    69508569          ip6-pop-hop-by-hop  Pkts w/ removed ip6 hop-by-hop options
+
+- To check the POT profiles, use "show pot profile"
+
+Example:
+
+    vpp# show pot profile
+    Profile list in use  : example
+    POT Profile at index: 0
+                     ID : 0
+              Validator : False (0)
+           Secret share : 0x564cdbdec4eb625d (6218586935324795485)
+           Prime number : 0x7fff0000fa884685 (9223090566081300101)
+    2nd polynomial(eval) : 0x23f3a227186a (39529304496234)
+                    LPC : 0x1 (1)
+               Bit mask : 0x7fffffffffffffff (9223372036854775807)
+    Profile index in use: 0
+    Pkts passed : 0x36 (54)
+
+- To get POT statistics for packets, use "show ioam pot"
+
+Example at an encapsulating or transit node:
+
+    vpp# show ioam pot
+    Pkts with ip6 hop-by-hop POT options - 54
+    Pkts with ip6 hop-by-hop POT options but no profile set - 0
+    Pkts with POT in Policy - 0
+    Pkts with POT out of Policy - 0
+
+
+Example at a decapsulating/verification node:
+
+
+    vpp# show ioam pot
+    Pkts with ip6 hop-by-hop POT options - 54
+    Pkts with ip6 hop-by-hop POT options but no profile set - 0
+    Pkts with POT in Policy - 54
+    Pkts with POT out of Policy - 0
+
+- Tracing - enable tracing of IPv6 packets to view the data inserted and
+collected.
+ +Example when the nodes are receiving data over a DPDK interface: +Enable tracing using "trace add dpdk-input 20" and +execute "show trace" to view the iOAM data collected: + + + vpp# trace add dpdk-input 20 + + vpp# show trace + + ------------------- Start of thread 0 vpp_main ------------------- + + Packet 1 + + 00:00:19:294697: dpdk-input + GigabitEthernetb/0/0 rx queue 0 + buffer 0x10e6b: current data 0, length 214, free-list 0, totlen-nifb 0, trace 0x0 + PKT MBUF: port 0, nb_segs 1, pkt_len 214 + buf_len 2176, data_len 214, ol_flags 0x0, data_off 128, phys_addr 0xe9a35a00 + packet_type 0x0 + IP6: 00:50:56:9c:df:72 -> 00:50:56:9c:be:55 + IP6_HOP_BY_HOP_OPTIONS: db05::2 -> db06::6 + tos 0x00, flow label 0x0, hop limit 63, payload length 160 + 00:00:19:294737: ethernet-input + IP6: 00:50:56:9c:df:72 -> 00:50:56:9c:be:55 + 00:00:19:294753: ip6-input + IP6_HOP_BY_HOP_OPTIONS: db05::2 -> db06::6 + tos 0x00, flow label 0x0, hop limit 63, payload length 160 + 00:00:19:294757: ip6-lookup + fib 0 adj-idx 15 : indirect via db05::2 flow hash: 0x00000000 + IP6_HOP_BY_HOP_OPTIONS: db05::2 -> db06::6 + tos 0x00, flow label 0x0, hop limit 63, payload length 160 + 00:00:19:294802: ip6-hop-by-hop + IP6_HOP_BY_HOP: next index 5 len 96 traced 96 Trace Type 0x1f , 1 elts left + [0] ttl 0x0 node ID 0x0 ingress 0x0 egress 0x0 ts 0x0 + app 0x0 + [1] ttl 0x3e node ID 0x3 ingress 0x1 egress 0x2 ts 0xb68c2213 + app 0x1234 + [2] ttl 0x3f node ID 0x2 ingress 0x1 egress 0x2 ts 0xb68c2204 + app 0x1234 + [3] ttl 0x40 node ID 0x1 ingress 0x5 egress 0x6 ts 0xb68c2200 + app 0x1234 + POT opt present + random = 0x577a916946071950, Cumulative = 0x10b46e78a35a392d, Index = 0x0 + 00:00:19:294810: ip6-rewrite + tx_sw_if_index 1 adj-idx 14 : GigabitEthernetb/0/0 + IP6: 00:50:56:9c:be:55 -> 00:50:56:9c:df:72 flow hash: 0x00000000 + IP6: 00:50:56:9c:be:55 -> 00:50:56:9c:df:72 + IP6_HOP_BY_HOP_OPTIONS: db05::2 -> db06::6 + tos 0x00, flow label 0x0, hop limit 62, payload length 160 + 00:00:19:294814: GigabitEthernetb/0/0-output + GigabitEthernetb/0/0 + IP6: 00:50:56:9c:be:55 -> 00:50:56:9c:df:72 + IP6_HOP_BY_HOP_OPTIONS: db05::2 -> db06::6 + tos 0x00, flow label 0x0, hop limit 62, payload length 160 + 00:00:19:294820: GigabitEthernetb/0/0-tx + GigabitEthernetb/0/0 tx queue 0 + buffer 0x10e6b: current data 0, length 214, free-list 0, totlen-nifb 0, trace 0x0 + IP6: 00:50:56:9c:be:55 -> 00:50:56:9c:df:72 + + IP6_HOP_BY_HOP_OPTIONS: db05::2 -> db06::6 + + tos 0x00, flow label 0x0, hop limit 62, payload length 160 + + +[iOAM-Devnet]: <https://github.com/ciscodevnet/iOAM> +[iOAM-ietf-requirements]:<https://tools.ietf.org/html/draft-brockners-inband-oam-requirements-01> +[iOAM-ietf-transport]:<https://tools.ietf.org/html/draft-brockners-inband-oam-transport-01> +[iOAM-ietf-data]:<https://tools.ietf.org/html/draft-brockners-inband-oam-data-01> +[iOAM-ietf-proof-of-transit]:<https://tools.ietf.org/html/draft-brockners-proof-of-transit-01> diff --git a/src/plugins/ioam/ip6/ioam_cache.api b/src/plugins/ioam/ip6/ioam_cache.api new file mode 100644 index 00000000..dd9c0186 --- /dev/null +++ b/src/plugins/ioam/ip6/ioam_cache.api @@ -0,0 +1,29 @@ +/* Hey Emacs use -*- mode: C -*- */ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* API to control ioam caching */ + +autoreply define ioam_cache_ip6_enable_disable { + /* Client identifier, set from api_main.my_client_index */ + u32 client_index; + + /* Arbitrary context, so client can match reply to request */ + u32 context; + + /* Enable / disable the feature */ + u8 is_disable; + +}; diff --git a/src/plugins/ioam/ip6/ioam_cache.c b/src/plugins/ioam/ip6/ioam_cache.c new file mode 100644 index 00000000..4c9997f4 --- /dev/null +++ b/src/plugins/ioam/ip6/ioam_cache.c @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + *------------------------------------------------------------------ + * ioam_cache.c - ioam ip6 API / debug CLI handling + *------------------------------------------------------------------ + */ + +#include <vnet/vnet.h> +#include <vnet/plugin/plugin.h> +#include <ioam/ip6/ioam_cache.h> + +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vlibsocket/api.h> +#include <vnet/ip/ip6_hop_by_hop.h> + +#include "ioam_cache.h" + +/* define message IDs */ +#include <ioam/ip6/ioam_cache_msg_enum.h> + +/* define message structures */ +#define vl_typedefs +#include <ioam/ip6/ioam_cache_all_api_h.h> +#undef vl_typedefs + +/* define generated endian-swappers */ +#define vl_endianfun +#include <ioam/ip6/ioam_cache_all_api_h.h> +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) 
vlib_cli_output (handle, __VA_ARGS__) +#define vl_printfun +#include <ioam/ip6/ioam_cache_all_api_h.h> +#undef vl_printfun + +/* Get the API version number */ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <ioam/ip6/ioam_cache_all_api_h.h> +#undef vl_api_version + +#define REPLY_MSG_ID_BASE cm->msg_id_base +#include <vlibapi/api_helper_macros.h> + +/* List of message types that this plugin understands */ +#define foreach_ioam_cache_plugin_api_msg \ +_(IOAM_CACHE_IP6_ENABLE_DISABLE, ioam_cache_ip6_enable_disable) + +static u8 * +ioam_e2e_id_trace_handler (u8 * s, ip6_hop_by_hop_option_t * opt) +{ + ioam_e2e_id_option_t *e2e = (ioam_e2e_id_option_t *) opt; + + if (e2e) + { + s = + format (s, "IP6_HOP_BY_HOP E2E ID = %U\n", format_ip6_address, + &(e2e->id)); + } + + + return s; +} + +static u8 * +ioam_e2e_cache_trace_handler (u8 * s, ip6_hop_by_hop_option_t * opt) +{ + ioam_e2e_cache_option_t *e2e = (ioam_e2e_cache_option_t *) opt; + + if (e2e) + { + s = + format (s, "IP6_HOP_BY_HOP E2E CACHE = pool:%d idx:%d\n", + e2e->pool_id, e2e->pool_index); + } + + + return s; +} + +/* Action function shared between message handler and debug CLI */ +int +ioam_cache_ip6_enable_disable (ioam_cache_main_t * em, + ip6_address_t * sr_localsid, u8 is_disable) +{ + vlib_main_t *vm = em->vlib_main; + + if (is_disable == 0) + { + ioam_cache_table_init (vm); + em->sr_localsid_cache.as_u64[0] = sr_localsid->as_u64[0]; + em->sr_localsid_cache.as_u64[1] = sr_localsid->as_u64[1]; + ip6_hbh_set_next_override (em->cache_hbh_slot); + ip6_hbh_register_option (HBH_OPTION_TYPE_IOAM_EDGE_TO_EDGE_ID, + 0, ioam_e2e_id_trace_handler); + ip6_hbh_register_option (HBH_OPTION_TYPE_IOAM_E2E_CACHE_ID, + 0, ioam_e2e_cache_trace_handler); + + } + else + { + ip6_hbh_set_next_override (IP6_LOOKUP_NEXT_POP_HOP_BY_HOP); + ioam_cache_table_destroy (vm); + em->sr_localsid_cache.as_u64[0] = 0; + em->sr_localsid_cache.as_u64[1] = 0; + ip6_hbh_unregister_option (HBH_OPTION_TYPE_IOAM_EDGE_TO_EDGE_ID); + ip6_hbh_unregister_option (HBH_OPTION_TYPE_IOAM_E2E_CACHE_ID); + } + + return 0; +} + +/* Action function shared between message handler and debug CLI */ +int +ioam_tunnel_select_ip6_enable_disable (ioam_cache_main_t * em, + u8 criteria, + u8 no_of_responses, + ip6_address_t * sr_localsid, + u8 is_disable) +{ + vlib_main_t *vm = em->vlib_main; + + if (is_disable == 0) + { + ioam_cache_ts_table_init (vm); + em->criteria_oneway = criteria; + em->wait_for_responses = no_of_responses; + em->sr_localsid_ts.as_u64[0] = sr_localsid->as_u64[0]; + em->sr_localsid_ts.as_u64[1] = sr_localsid->as_u64[1]; + ip6_hbh_set_next_override (em->ts_hbh_slot); + ip6_ioam_ts_cache_set_rewrite (); + ip6_hbh_register_option (HBH_OPTION_TYPE_IOAM_EDGE_TO_EDGE_ID, + 0, ioam_e2e_id_trace_handler); + ip6_hbh_register_option (HBH_OPTION_TYPE_IOAM_E2E_CACHE_ID, + 0, ioam_e2e_cache_trace_handler); + + /* Turn on the cleanup process */ + // vlib_process_signal_event (vm, em->cleanup_process_node_index, 1, 0); + } + else + { + ioam_cache_ts_timer_node_enable (vm, 0); + ip6_hbh_set_next_override (IP6_LOOKUP_NEXT_POP_HOP_BY_HOP); + em->sr_localsid_ts.as_u64[0] = 0; + em->sr_localsid_ts.as_u64[1] = 0; + ioam_cache_ts_table_destroy (vm); + ip6_ioam_ts_cache_cleanup_rewrite (); + ip6_hbh_unregister_option (HBH_OPTION_TYPE_IOAM_EDGE_TO_EDGE_ID); + ip6_hbh_unregister_option (HBH_OPTION_TYPE_IOAM_E2E_CACHE_ID); + } + + return 0; +} + +/* API message handler */ +static void vl_api_ioam_cache_ip6_enable_disable_t_handler + (vl_api_ioam_cache_ip6_enable_disable_t * mp) 
+{
+  vl_api_ioam_cache_ip6_enable_disable_reply_t *rmp;
+  ioam_cache_main_t *cm = &ioam_cache_main;
+  ip6_address_t sr_localsid;
+  int rv;
+
+  sr_localsid.as_u64[0] = 0;
+  sr_localsid.as_u64[1] = 0;
+  rv =
+    ioam_cache_ip6_enable_disable (cm, &sr_localsid, (int) (mp->is_disable));
+  REPLY_MACRO (VL_API_IOAM_CACHE_IP6_ENABLE_DISABLE_REPLY);
+}
+
+/* Set up the API message handling tables */
+static clib_error_t *
+ioam_cache_plugin_api_hookup (vlib_main_t * vm)
+{
+  ioam_cache_main_t *sm = &ioam_cache_main;
+#define _(N,n)                                                  \
+    vl_msg_api_set_handlers((VL_API_##N + sm->msg_id_base),     \
+                           #n,                                  \
+                           vl_api_##n##_t_handler,              \
+                           vl_noop_handler,                     \
+                           vl_api_##n##_t_endian,               \
+                           vl_api_##n##_t_print,                \
+                           sizeof(vl_api_##n##_t), 1);
+  foreach_ioam_cache_plugin_api_msg;
+#undef _
+
+  return 0;
+}
+
+#define vl_msg_name_crc_list
+#include <ioam/ip6/ioam_cache_all_api_h.h>
+#undef vl_msg_name_crc_list
+
+static void
+setup_message_id_table (ioam_cache_main_t * sm, api_main_t * am)
+{
+#define _(id,n,crc) \
+  vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id + sm->msg_id_base);
+  foreach_vl_msg_name_crc_ioam_cache;
+#undef _
+}
+
+static clib_error_t *
+set_ioam_cache_command_fn (vlib_main_t * vm,
+                           unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  ioam_cache_main_t *em = &ioam_cache_main;
+  u8 is_disable = 0;
+  ip6_address_t sr_localsid;
+  u8 address_set = 0;
+
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (input, "disable"))
+        is_disable = 1;
+      else if (!address_set
+               && unformat (input, "sr_localsid %U", unformat_ip6_address,
+                            &sr_localsid))
+        address_set = 1;
+      else
+        break;
+    }
+
+  if (is_disable == 0 && !address_set)
+    return clib_error_return (0, "Error: SRv6 LocalSID address is mandatory");
+
+  ioam_cache_ip6_enable_disable (em, &sr_localsid, is_disable);
+
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (set_ioam_cache_command, static) =
+{
+.path = "set ioam ip6 cache",.short_help =
+  "set ioam ip6 cache sr_localsid <ip6 address> [disable]",.function =
+  set_ioam_cache_command_fn};
+/* *INDENT-ON* */
+
+#define IOAM_TS_WAIT_FOR_RESPONSES 3
+static clib_error_t *
+set_ioam_tunnel_select_command_fn (vlib_main_t * vm,
+                                   unformat_input_t * input,
+                                   vlib_cli_command_t * cmd)
+{
+  ioam_cache_main_t *em = &ioam_cache_main;
+  u8 is_disable = 0;
+  u8 one_way = 0;
+  ip6_address_t sr_localsid;
+  u8 address_set = 0;
+  u32 no_of_responses = IOAM_TS_WAIT_FOR_RESPONSES;
+
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (input, "disable"))
+        is_disable = 1;
+      else if (unformat (input, "rtt"))
+        one_way = 0;
+      else if (unformat (input, "oneway"))
+        one_way = 1;
+      /* unformat %d writes a u32, so parse into a u32, not a u8 */
+      else if (unformat (input, "wait_for_responses %d", &no_of_responses))
+        ;
+      else if (!address_set
+               && unformat (input, "sr_localsid %U", unformat_ip6_address,
+                            &sr_localsid))
+        address_set = 1;
+      else
+        break;
+    }
+  if (is_disable == 0 && !address_set)
+    return clib_error_return (0,
+                              "Error: SRv6 LocalSID address is mandatory to receive responses.");
+
+  ioam_tunnel_select_ip6_enable_disable (em, one_way, (u8) no_of_responses,
+                                         &sr_localsid, is_disable);
+
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (set_ioam_cache_ts_command, static) =
+{
+.path = "set ioam ip6 sr-tunnel-select",.short_help =
+  "set ioam ip6 sr-tunnel-select [disable] [oneway|rtt] [wait_for_responses <n|default 3>] \
+  [sr_localsid <ip6 address>]",.function = set_ioam_tunnel_select_command_fn};
+/* *INDENT-ON* */
+
+static void
+ioam_cache_table_print (vlib_main_t * vm, u8 verbose)
+{
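+  /*
+   * Illustrative output of "show ioam ip6 cache", per the
+   * format_ioam_cache_entry and stats formats used below
+   * (addresses, ports and counts here are made up):
+   *
+   *   0: db05::2:20000 to db06::6:80 seq_no 3232
+   *   Number of entries in thread-0 selection pool: 2
+   *    (pool found to be full: 0 times)
+   */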
+  ioam_cache_main_t *cm = &ioam_cache_main;
+  ioam_cache_entry_t *entry = 0;
+  ioam_cache_ts_entry_t *ts_entry = 0;
+  int no_of_threads = vec_len (vlib_worker_threads);
+  int i;
+
+  pool_foreach (entry, cm->ioam_rewrite_pool, (
+                                                {
+                                                vlib_cli_output (vm, "%U",
+                                                                 format_ioam_cache_entry,
+                                                                 entry);
+                                                }));
+
+  if (cm->ts_stats)
+    for (i = 0; i < no_of_threads; i++)
+      {
+        vlib_cli_output (vm, "Number of entries in thread-%d selection pool: %lu\n \
+                           (pool found to be full: %lu times)", i,
+                         cm->ts_stats[i].inuse, cm->ts_stats[i].add_failed);
+
+        if (verbose == 1)
+          vlib_worker_thread_barrier_sync (vm);
+        pool_foreach (ts_entry, cm->ioam_ts_pool[i], (
+                                                       {
+                                                       vlib_cli_output (vm,
+                                                                        "%U",
+                                                                        format_ioam_cache_ts_entry,
+                                                                        ts_entry,
+                                                                        (u32)
+                                                                        i);
+                                                       }
+                      ));
+        /* only release the barrier if we actually took it above */
+        if (verbose == 1)
+          vlib_worker_thread_barrier_release (vm);
+      }
+
+}
+
+static clib_error_t *
+show_ioam_cache_command_fn (vlib_main_t * vm,
+                            unformat_input_t * input,
+                            vlib_cli_command_t * cmd)
+{
+  u8 verbose = 0;
+
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (input, "verbose"))
+        verbose = 1;
+      else
+        break;                  /* avoid spinning on unknown input */
+    }
+  ioam_cache_table_print (vm, verbose);
+
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (show_ioam_cache_command, static) =
+{
+.path = "show ioam ip6 cache",.short_help =
+  "show ioam ip6 cache [verbose]",.function = show_ioam_cache_command_fn};
+/* *INDENT-ON* */
+
+static clib_error_t *
+ioam_cache_init (vlib_main_t * vm)
+{
+  ioam_cache_main_t *em = &ioam_cache_main;
+  clib_error_t *error = 0;
+  u8 *name;
+  u32 cache_node_index = ioam_cache_node.index;
+  u32 ts_node_index = ioam_cache_ts_node.index;
+  vlib_node_t *ip6_hbyh_node = NULL, *ip6_hbh_pop_node = NULL, *error_node =
+    NULL;
+
+  name = format (0, "ioam_cache_%08x%c", api_version, 0);
+
+  memset (&ioam_cache_main, 0, sizeof (ioam_cache_main));
+  /* Ask for a correctly-sized block of API message decode slots */
+  em->msg_id_base = vl_msg_api_get_msg_ids
+    ((char *) name, VL_MSG_FIRST_AVAILABLE);
+
+  error = ioam_cache_plugin_api_hookup (vm);
+
+  /* Add our API messages to the global name_crc hash table */
+  setup_message_id_table (em, &api_main);
+
+  /* Hook this node to ip6-hop-by-hop */
+  ip6_hbyh_node = vlib_get_node_by_name (vm, (u8 *) "ip6-hop-by-hop");
+  em->cache_hbh_slot =
+    vlib_node_add_next (vm, ip6_hbyh_node->index, cache_node_index);
+  em->ts_hbh_slot =
+    vlib_node_add_next (vm, ip6_hbyh_node->index, ts_node_index);
+
+  ip6_hbh_pop_node = vlib_get_node_by_name (vm, (u8 *) "ip6-pop-hop-by-hop");
+  em->ip6_hbh_pop_node_index = ip6_hbh_pop_node->index;
+
+  error_node = vlib_get_node_by_name (vm, (u8 *) "error-drop");
+  em->error_node_index = error_node->index;
+  em->vlib_main = vm;
+
+  vec_free (name);
+
+  return error;
+}
+
+VLIB_INIT_FUNCTION (ioam_cache_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */ diff --git a/src/plugins/ioam/ip6/ioam_cache.h b/src/plugins/ioam/ip6/ioam_cache.h new file mode 100644 index 00000000..25a8fb65 --- /dev/null +++ b/src/plugins/ioam/ip6/ioam_cache.h @@ -0,0 +1,903 @@ +/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_ioam_cache_h__
+#define __included_ioam_cache_h__
+
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ip/ip_packet.h>
+#include <vnet/ip/ip4_packet.h>
+#include <vnet/ip/ip6_packet.h>
+#include <vnet/srv6/sr.h>
+
+#include <vppinfra/pool.h>
+#include <vppinfra/hash.h>
+#include <vppinfra/error.h>
+#include <vppinfra/elog.h>
+#include <vppinfra/bihash_8_8.h>
+#include <ioam/analyse/ip6/ip6_ioam_analyse.h>
+#include <vppinfra/tw_timer_16t_2w_512sl.h>
+/*
+ * ioam_cache.h
+ * This header contains routines for caching of the ioam header and
+ * buffer:
+ * 1 - On the application facing node: to cache the ioam header received
+ *     in a request and reattach it in the response, providing round
+ *     trip path visibility. Since request-response matching
+ *     is needed, this works with TCP and relies on (5-tuple, seq no).
+ * 2 - On the M-Anycast server node: this node replicates requests
+ *     towards multiple anycast service nodes serving the anycast
+ *     IP6 address. It evaluates the responses and forwards the best
+ *     one towards the client requesting the service.
+ *     Again, since request-response matching is needed, this works
+ *     with TCP and relies on (5-tuple, seq no) for matching.
+ *     To do this it caches SYN-ACK responses for a short time to
+ *     evaluate multiple responses received before the selected
+ *     SYN-ACK response is forwarded and the others dropped.
+ *
+ * M-Anycast server cache:
+ *   - There is a pool of cache entries per worker thread.
+ *   - A cache entry is created when a SYN is received; the expected
+ *     number of responses is set based on the number of
+ *     SR tunnels for the anycast destination address.
+ *   - The pool/thread id and pool index are attached in the
+ *     message as an ioam option for quick lookup.
+ *   - When a SYN-ACK is received, the ioam option containing the
+ *     thread id + pool index of the cache entry is used to
+ *     look up the cache entry.
+ *   - Cache synchronization:
+ *      - This is achieved by having cache entry add/del/update all
+ *        handled by the same worker/main thread.
+ *      - Packets from client to threads - SYN packets - can be
+ *        distributed based on incoming interface affinity to the cpu
+ *        core pinned to the thread, or a simple sequence-number-based
+ *        distribution if thread-per-interface does not scale.
+ *      - Response packets from server towards clients - SYN-ACKs - are
+ *        forced to the same thread that created the cache entry
+ *        using SR, with the destination set to the SR v6 address
+ *        assigned to the core/thread. This address is sent as an ioam
+ *        option in the SYN so that the other side can use it to
+ *        populate the v6 dst address in the response.
+ *   - Timeout: a timer wheel per thread is used to track the SYN-ACK
+ *     wait time. The timer wheel tick is updated via an input node per
+ *     thread.
+ *
+ * Application facing node/Service side cache:
+ *   - Single pool of cache entries.
+ *   - A cache entry is created when a SYN is received. Caches the ioam
+ *     header.
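+ *     As a sketch of the lookup key construction used in this file
+ *     (see ioam_cache_add / ioam_cache_lookup; names as defined below):
+ *
+ *       u32 flow_hash = ip6_compute_flow_hash_ext (ip0, protocol,
+ *                                                  src_port, dst_port,
+ *                                                  IP_FLOW_HASH_DEFAULT);
+ *       u64 key = ((u64) flow_hash << 32) | seq_no;  (bihash_8_8 key)
+ *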
Hash table entry is created based on 5 tuple and + * TCP seq no to pool index + * - Response SYN-ACK processed by looking up pool index in hash table + * and cache entry in the pool is used to get the ioam header rewrite + * string. Entry is freed from pool and hash table after use. + * - Locking/Synchronization: Currently this functionality is deployed + * with main/single thread only. Hence no locking is used. + * - Deployment: A VPP node per application server servicing anycast + * address is expected. Locking/synchronization needed when the server + * /application facing node is started with multiple worker threads. + * + */ + +/* + * Application facing server side caching: + * Cache entry for ioam header + * Currently caters to TCP and relies on + * TCP - 5 tuples + seqno to cache and reinsert + * ioam header b/n TCP request response + */ +typedef struct +{ + ip6_address_t src_address; + ip6_address_t dst_address; + u16 src_port; + u16 dst_port; + u8 protocol; + u32 seq_no; + ip6_address_t next_hop; + u16 my_address_offset; + u8 *ioam_rewrite_string; +} ioam_cache_entry_t; + +/* + * Cache entry for anycast server selection + * Works for TCP as 5 tuple + sequence number + * is required for request response matching + * max_responses expected is set based on number + * of SR tunnels for the dst_address + * Timeout or all response_received = max_responses + * will clear the entry + * buffer_index index of the response msg vlib buffer + * that is currently the best response + */ +typedef struct +{ + u32 pool_id; + u32 pool_index; + ip6_address_t src_address; + ip6_address_t dst_address; + u16 src_port; + u16 dst_port; + u8 protocol; + u32 seq_no; + u32 buffer_index; + ip6_hop_by_hop_header_t *hbh; //pointer to hbh header in the buffer + u64 created_at; + u8 response_received; + u8 max_responses; + u32 stop_timer_handle; + /** Handle returned from tw_start_timer */ + u32 timer_handle; + /** entry should expire at this clock tick */ + u32 expected_to_expire; +} ioam_cache_ts_entry_t; + +/* + * Per thread tunnel selection cache stats + */ +typedef struct +{ + u64 inuse; + u64 add_failed; +} ioam_cache_ts_pool_stats_t; + +/* Server side: iOAM header caching */ +#define MAX_CACHE_ENTRIES 4096 +/* M-Anycast: Cache for SR tunnel selection */ +#define MAX_CACHE_TS_ENTRIES 1048576 + +#define IOAM_CACHE_TABLE_DEFAULT_HASH_NUM_BUCKETS (4 * 1024) +#define IOAM_CACHE_TABLE_DEFAULT_HASH_MEMORY_SIZE (2<<20) + +typedef struct +{ + /* API message ID base */ + u16 msg_id_base; + + /* Pool of ioam_cache_buffer_t */ + ioam_cache_entry_t *ioam_rewrite_pool; + + /* For steering packets ioam cache entry is followed by + * SR header. 
This is the SR rewrite template */
+  u8 *sr_rewrite_template;
+  /* The current rewrite string being used */
+  u8 *rewrite;
+  u8 rewrite_pool_index_offset;
+  ip6_address_t sr_localsid_cache;
+
+  u64 lookup_table_nbuckets;
+  u64 lookup_table_size;
+  clib_bihash_8_8_t ioam_rewrite_cache_table;
+
+  /* M-Anycast: Pool of ioam_cache_ts_entry_t per thread */
+  ioam_cache_ts_entry_t **ioam_ts_pool;
+  ioam_cache_ts_pool_stats_t *ts_stats;
+  /** per thread single-wheel */
+  tw_timer_wheel_16t_2w_512sl_t *timer_wheels;
+
+  /*
+   * Selection criteria: one-way delay (server to M-Anycast)
+   * or RTT
+   */
+  bool criteria_oneway;
+  u8 wait_for_responses;
+  ip6_address_t sr_localsid_ts;
+
+  /* convenience */
+  vlib_main_t *vlib_main;
+
+  uword cache_hbh_slot;
+  uword ts_hbh_slot;
+  u32 ip6_hbh_pop_node_index;
+  u32 error_node_index;
+  u32 cleanup_process_node_index;
+} ioam_cache_main_t;
+
+ioam_cache_main_t ioam_cache_main;
+
+extern vlib_node_registration_t ioam_cache_node;
+extern vlib_node_registration_t ioam_cache_ts_node;
+
+/* Compute the flow hash. We'll use it as part of the cache lookup key,
+ * among other things.
+ * ip6_compute_flow_hash in ip6.h doesn't locate tcp/udp when
+ * ext headers are present. While it could be made to, that would be a
+ * performance hit for ECMP flows.
+ * Hence this function, which takes the L4 information directly as
+ * input. Useful when the tcp/udp headers have already been located in
+ * the presence of ext headers.
+ */
+always_inline u32
+ip6_compute_flow_hash_ext (const ip6_header_t * ip,
+                           u8 protocol,
+                           u16 src_port,
+                           u16 dst_port, flow_hash_config_t flow_hash_config)
+{
+  u64 a, b, c;
+  u64 t1, t2;
+
+  t1 = (ip->src_address.as_u64[0] ^ ip->src_address.as_u64[1]);
+  t1 = (flow_hash_config & IP_FLOW_HASH_SRC_ADDR) ? t1 : 0;
+
+  t2 = (ip->dst_address.as_u64[0] ^ ip->dst_address.as_u64[1]);
+  t2 = (flow_hash_config & IP_FLOW_HASH_DST_ADDR) ? t2 : 0;
+
+  a = (flow_hash_config & IP_FLOW_HASH_REVERSE_SRC_DST) ? t2 : t1;
+  b = (flow_hash_config & IP_FLOW_HASH_REVERSE_SRC_DST) ? t1 : t2;
+  b ^= (flow_hash_config & IP_FLOW_HASH_PROTO) ? protocol : 0;
+
+  t1 = src_port;
+  t2 = dst_port;
+
+  t1 = (flow_hash_config & IP_FLOW_HASH_SRC_PORT) ? t1 : 0;
+  t2 = (flow_hash_config & IP_FLOW_HASH_DST_PORT) ? t2 : 0;
+
+  c = (flow_hash_config & IP_FLOW_HASH_REVERSE_SRC_DST) ?
+    ((t1 << 16) | t2) : ((t2 << 16) | t1);
+
+  hash_mix64 (a, b, c);
+  return (u32) c;
+}
+
+
+/* 2 new ioam E2E options:
+ * 1. HBH_OPTION_TYPE_IOAM_EDGE_TO_EDGE_ID: IP6 address
+ *    of the ioam node that inserted the ioam header
+ * 2. 
HBH_OPTION_TYPE_IOAM_E2E_CACHE_ID: Pool id and index + * to look up tunnel select cache entry + */ +#define HBH_OPTION_TYPE_IOAM_EDGE_TO_EDGE_ID 30 +#define HBH_OPTION_TYPE_IOAM_E2E_CACHE_ID 31 + +typedef CLIB_PACKED (struct + { + ip6_hop_by_hop_option_t hdr; u8 e2e_type; u8 reserved[5]; + ip6_address_t id; + }) ioam_e2e_id_option_t; + +typedef CLIB_PACKED (struct + { + ip6_hop_by_hop_option_t hdr; u8 e2e_type; u8 pool_id; + u32 pool_index; + }) ioam_e2e_cache_option_t; + +#define IOAM_E2E_ID_OPTION_RND ((sizeof(ioam_e2e_id_option_t) + 7) & ~7) +#define IOAM_E2E_ID_HBH_EXT_LEN (IOAM_E2E_ID_OPTION_RND >> 3) +#define IOAM_E2E_CACHE_OPTION_RND ((sizeof(ioam_e2e_cache_option_t) + 7) & ~7) +#define IOAM_E2E_CACHE_HBH_EXT_LEN (IOAM_E2E_CACHE_OPTION_RND >> 3) + +static inline void +ioam_e2e_id_rewrite_handler (ioam_e2e_id_option_t * e2e_option, + ip6_address_t * address) +{ + e2e_option->id.as_u64[0] = address->as_u64[0]; + e2e_option->id.as_u64[1] = address->as_u64[1]; + +} + +/* Following functions are for the caching of ioam header + * to enable reattaching it for a complete request-response + * message exchange */ +inline static void +ioam_cache_entry_free (ioam_cache_entry_t * entry) +{ + ioam_cache_main_t *cm = &ioam_cache_main; + if (entry) + { + vec_free (entry->ioam_rewrite_string); + memset (entry, 0, sizeof (*entry)); + pool_put (cm->ioam_rewrite_pool, entry); + } +} + +inline static ioam_cache_entry_t * +ioam_cache_entry_cleanup (u32 pool_index) +{ + ioam_cache_main_t *cm = &ioam_cache_main; + ioam_cache_entry_t *entry = 0; + + entry = pool_elt_at_index (cm->ioam_rewrite_pool, pool_index); + ioam_cache_entry_free (entry); + return (0); +} + +inline static ioam_cache_entry_t * +ioam_cache_lookup (ip6_header_t * ip0, u16 src_port, u16 dst_port, u32 seq_no) +{ + ioam_cache_main_t *cm = &ioam_cache_main; + u32 flow_hash = ip6_compute_flow_hash_ext (ip0, ip0->protocol, + src_port, dst_port, + IP_FLOW_HASH_DEFAULT | + IP_FLOW_HASH_REVERSE_SRC_DST); + clib_bihash_kv_8_8_t kv, value; + + kv.key = (u64) flow_hash << 32 | seq_no; + kv.value = 0; + value.key = 0; + value.value = 0; + + if (clib_bihash_search_8_8 (&cm->ioam_rewrite_cache_table, &kv, &value) >= + 0) + { + ioam_cache_entry_t *entry = 0; + + entry = pool_elt_at_index (cm->ioam_rewrite_pool, value.value); + /* match */ + if (ip6_address_compare (&ip0->src_address, &entry->dst_address) == 0 && + ip6_address_compare (&ip0->dst_address, &entry->src_address) == 0 && + entry->src_port == dst_port && + entry->dst_port == src_port && entry->seq_no == seq_no) + { + /* If lookup is successful remove it from the hash */ + clib_bihash_add_del_8_8 (&cm->ioam_rewrite_cache_table, &kv, 0); + return (entry); + } + else + return (0); + + } + return (0); +} + +/* + * Caches ioam hbh header + * Extends the hbh header with option to contain IP6 address of the node + * that caches it + */ +inline static int +ioam_cache_add (vlib_buffer_t * b0, + ip6_header_t * ip0, + u16 src_port, + u16 dst_port, ip6_hop_by_hop_header_t * hbh0, u32 seq_no) +{ + ioam_cache_main_t *cm = &ioam_cache_main; + ioam_cache_entry_t *entry = 0; + u32 rewrite_len = 0, e2e_id_offset = 0; + u32 pool_index = 0; + ioam_e2e_id_option_t *e2e = 0; + + pool_get_aligned (cm->ioam_rewrite_pool, entry, CLIB_CACHE_LINE_BYTES); + memset (entry, 0, sizeof (*entry)); + pool_index = entry - cm->ioam_rewrite_pool; + + clib_memcpy (entry->dst_address.as_u64, ip0->dst_address.as_u64, + sizeof (ip6_address_t)); + clib_memcpy (entry->src_address.as_u64, ip0->src_address.as_u64, + sizeof 
(ip6_address_t)); + entry->src_port = src_port; + entry->dst_port = dst_port; + entry->seq_no = seq_no; + rewrite_len = ((hbh0->length + 1) << 3); + vec_validate (entry->ioam_rewrite_string, rewrite_len - 1); + e2e = ip6_ioam_find_hbh_option (hbh0, HBH_OPTION_TYPE_IOAM_EDGE_TO_EDGE_ID); + if (e2e) + { + entry->next_hop.as_u64[0] = e2e->id.as_u64[0]; + entry->next_hop.as_u64[1] = e2e->id.as_u64[1]; + } + else + { + return (-1); + } + e2e_id_offset = (u8 *) e2e - (u8 *) hbh0; + /* setup e2e id option to insert v6 address of the node caching it */ + clib_memcpy (entry->ioam_rewrite_string, hbh0, rewrite_len); + hbh0 = (ip6_hop_by_hop_header_t *) entry->ioam_rewrite_string; + + /* suffix rewrite string with e2e ID option */ + e2e = (ioam_e2e_id_option_t *) (entry->ioam_rewrite_string + e2e_id_offset); + ioam_e2e_id_rewrite_handler (e2e, &cm->sr_localsid_cache); + entry->my_address_offset = (u8 *) (&e2e->id) - (u8 *) hbh0; + + /* add it to hash, replacing and freeing any collision for now */ + u32 flow_hash = + ip6_compute_flow_hash_ext (ip0, hbh0->protocol, src_port, dst_port, + IP_FLOW_HASH_DEFAULT); + clib_bihash_kv_8_8_t kv, value; + kv.key = (u64) flow_hash << 32 | seq_no; + kv.value = 0; + if (clib_bihash_search_8_8 (&cm->ioam_rewrite_cache_table, &kv, &value) >= + 0) + { + /* replace */ + ioam_cache_entry_cleanup (value.value); + } + kv.value = pool_index; + clib_bihash_add_del_8_8 (&cm->ioam_rewrite_cache_table, &kv, 1); + return (0); +} + +/* Creates SR rewrite string + * This is appended with ioam header on the server facing + * node. + * This SR header is necessary to attract packets towards + * selected Anycast server. + */ +inline static void +ioam_cache_sr_rewrite_template_create (void) +{ + ioam_cache_main_t *cm = &ioam_cache_main; + ip6_address_t *segments = 0; + ip6_address_t *this_seg = 0; + + /* This nodes address and the original dest will be + * filled when the packet is processed */ + vec_add2 (segments, this_seg, 1); + memset (this_seg, 0xfe, sizeof (ip6_address_t)); + cm->sr_rewrite_template = ip6_sr_compute_rewrite_string_insert (segments); + vec_free (segments); +} + +inline static int +ioam_cache_table_init (vlib_main_t * vm) +{ + ioam_cache_main_t *cm = &ioam_cache_main; + + pool_alloc_aligned (cm->ioam_rewrite_pool, + MAX_CACHE_ENTRIES, CLIB_CACHE_LINE_BYTES); + cm->lookup_table_nbuckets = IOAM_CACHE_TABLE_DEFAULT_HASH_NUM_BUCKETS; + cm->lookup_table_nbuckets = 1 << max_log2 (cm->lookup_table_nbuckets); + cm->lookup_table_size = IOAM_CACHE_TABLE_DEFAULT_HASH_MEMORY_SIZE; + + clib_bihash_init_8_8 (&cm->ioam_rewrite_cache_table, + "ioam rewrite cache table", + cm->lookup_table_nbuckets, cm->lookup_table_size); + /* Create SR rewrite template */ + ioam_cache_sr_rewrite_template_create (); + return (1); +} + +inline static int +ioam_cache_table_destroy (vlib_main_t * vm) +{ + ioam_cache_main_t *cm = &ioam_cache_main; + ioam_cache_entry_t *entry = 0; + /* free pool and hash table */ + clib_bihash_free_8_8 (&cm->ioam_rewrite_cache_table); + pool_foreach (entry, cm->ioam_rewrite_pool, ( + { + ioam_cache_entry_free (entry); + })); + pool_free (cm->ioam_rewrite_pool); + cm->ioam_rewrite_pool = 0; + vec_free (cm->sr_rewrite_template); + cm->sr_rewrite_template = 0; + return (0); +} + +inline static u8 * +format_ioam_cache_entry (u8 * s, va_list * args) +{ + ioam_cache_entry_t *e = va_arg (*args, ioam_cache_entry_t *); + ioam_cache_main_t *cm = &ioam_cache_main; + int rewrite_len = vec_len (e->ioam_rewrite_string); + + s = format (s, "%d: %U:%d to %U:%d seq_no %lu\n", + (e - 
cm->ioam_rewrite_pool), + format_ip6_address, &e->src_address, + e->src_port, + format_ip6_address, &e->dst_address, e->dst_port, e->seq_no); + + if (rewrite_len) + { + s = format (s, " %U", + format_ip6_hop_by_hop_ext_hdr, + (ip6_hop_by_hop_header_t *) e->ioam_rewrite_string, + rewrite_len - 1); + } + return s; +} + +void ioam_cache_ts_timer_node_enable (vlib_main_t * vm, u8 enable); + +#define IOAM_CACHE_TS_TIMEOUT 1.0 //SYN timeout 1 sec +#define IOAM_CACHE_TS_TICK 100e-3 +/* Timer delays as multiples of 100ms */ +#define IOAM_CACHE_TS_TIMEOUT_TICKS IOAM_CACHE_TS_TICK*9 +#define TIMER_HANDLE_INVALID ((u32) ~0) + + +void expired_cache_ts_timer_callback (u32 * expired_timers); + +/* + * Following functions are to manage M-Anycast server selection + * cache + * There is a per worker thread pool to create a cache entry + * for a TCP SYN received. TCP SYN-ACK contians ioam header + * with HBH_OPTION_TYPE_IOAM_E2E_CACHE_ID option to point to the + * entry. + */ +inline static int +ioam_cache_ts_table_init (vlib_main_t * vm) +{ + ioam_cache_main_t *cm = &ioam_cache_main; + int no_of_threads = vec_len (vlib_worker_threads); + int i; + + vec_validate_aligned (cm->ioam_ts_pool, no_of_threads - 1, + CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (cm->ts_stats, no_of_threads - 1, + CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (cm->timer_wheels, no_of_threads - 1, + CLIB_CACHE_LINE_BYTES); + cm->lookup_table_nbuckets = IOAM_CACHE_TABLE_DEFAULT_HASH_NUM_BUCKETS; + cm->lookup_table_nbuckets = 1 << max_log2 (cm->lookup_table_nbuckets); + cm->lookup_table_size = IOAM_CACHE_TABLE_DEFAULT_HASH_MEMORY_SIZE; + for (i = 0; i < no_of_threads; i++) + { + pool_alloc_aligned (cm->ioam_ts_pool[i], + MAX_CACHE_TS_ENTRIES, CLIB_CACHE_LINE_BYTES); + memset (&cm->ts_stats[i], 0, sizeof (ioam_cache_ts_pool_stats_t)); + tw_timer_wheel_init_16t_2w_512sl (&cm->timer_wheels[i], + expired_cache_ts_timer_callback, + IOAM_CACHE_TS_TICK + /* timer period 100ms */ , + 10e4); + cm->timer_wheels[i].last_run_time = vlib_time_now (vm); + } + ioam_cache_ts_timer_node_enable (vm, 1); + return (1); +} + +always_inline void +ioam_cache_ts_timer_set (ioam_cache_main_t * cm, + ioam_cache_ts_entry_t * entry, u32 interval) +{ + entry->timer_handle + = tw_timer_start_16t_2w_512sl (&cm->timer_wheels[entry->pool_id], + entry->pool_index, 1, interval); +} + +always_inline void +ioam_cache_ts_timer_reset (ioam_cache_main_t * cm, + ioam_cache_ts_entry_t * entry) +{ + tw_timer_stop_16t_2w_512sl (&cm->timer_wheels[entry->pool_id], + entry->timer_handle); + entry->timer_handle = TIMER_HANDLE_INVALID; +} + +inline static void +ioam_cache_ts_entry_free (u32 thread_id, + ioam_cache_ts_entry_t * entry, u32 node_index) +{ + ioam_cache_main_t *cm = &ioam_cache_main; + vlib_main_t *vm = cm->vlib_main; + vlib_frame_t *nf = 0; + u32 *to_next; + + if (entry) + { + if (entry->hbh != 0) + { + nf = vlib_get_frame_to_node (vm, node_index); + nf->n_vectors = 0; + to_next = vlib_frame_vector_args (nf); + nf->n_vectors = 1; + to_next[0] = entry->buffer_index; + vlib_put_frame_to_node (vm, node_index, nf); + } + pool_put (cm->ioam_ts_pool[thread_id], entry); + cm->ts_stats[thread_id].inuse--; + memset (entry, 0, sizeof (*entry)); + } +} + +inline static int +ioam_cache_ts_table_destroy (vlib_main_t * vm) +{ + ioam_cache_main_t *cm = &ioam_cache_main; + ioam_cache_ts_entry_t *entry = 0; + int no_of_threads = vec_len (vlib_worker_threads); + int i; + + /* free pool and hash table */ + for (i = 0; i < no_of_threads; i++) + { + pool_foreach (entry, 
cm->ioam_ts_pool[i], ( + { + ioam_cache_ts_entry_free (i, + entry, + cm->error_node_index); + } + )); + pool_free (cm->ioam_ts_pool[i]); + cm->ioam_ts_pool = 0; + tw_timer_wheel_free_16t_2w_512sl (&cm->timer_wheels[i]); + } + vec_free (cm->ioam_ts_pool); + return (0); +} + +inline static int +ioam_cache_ts_entry_cleanup (u32 thread_id, u32 pool_index) +{ + ioam_cache_main_t *cm = &ioam_cache_main; + ioam_cache_ts_entry_t *entry = 0; + + entry = pool_elt_at_index (cm->ioam_ts_pool[thread_id], pool_index); + ioam_cache_ts_entry_free (thread_id, entry, cm->error_node_index); + return (0); +} + +/* + * Caches buffer for ioam SR tunnel select for Anycast service + */ +inline static int +ioam_cache_ts_add (ip6_header_t * ip0, + u16 src_port, + u16 dst_port, + u32 seq_no, + u8 max_responses, u64 now, u32 thread_id, u32 * pool_index) +{ + ioam_cache_main_t *cm = &ioam_cache_main; + ioam_cache_ts_entry_t *entry = 0; + + if (cm->ts_stats[thread_id].inuse == MAX_CACHE_TS_ENTRIES) + { + cm->ts_stats[thread_id].add_failed++; + return (-1); + } + + pool_get_aligned (cm->ioam_ts_pool[thread_id], entry, + CLIB_CACHE_LINE_BYTES); + memset (entry, 0, sizeof (*entry)); + *pool_index = entry - cm->ioam_ts_pool[thread_id]; + + clib_memcpy (entry->dst_address.as_u64, ip0->dst_address.as_u64, + sizeof (ip6_address_t)); + clib_memcpy (entry->src_address.as_u64, ip0->src_address.as_u64, + sizeof (ip6_address_t)); + entry->src_port = src_port; + entry->dst_port = dst_port; + entry->seq_no = seq_no; + entry->response_received = 0; + entry->max_responses = max_responses; + entry->created_at = now; + entry->hbh = 0; + entry->buffer_index = 0; + entry->pool_id = thread_id; + entry->pool_index = *pool_index; + ioam_cache_ts_timer_set (cm, entry, IOAM_CACHE_TS_TIMEOUT); + cm->ts_stats[thread_id].inuse++; + return (0); +} + +inline static void +ioam_cache_ts_send (u32 thread_id, i32 pool_index) +{ + ioam_cache_main_t *cm = &ioam_cache_main; + ioam_cache_ts_entry_t *entry = 0; + + entry = pool_elt_at_index (cm->ioam_ts_pool[thread_id], pool_index); + if (!pool_is_free (cm->ioam_ts_pool[thread_id], entry) && entry) + { + /* send and free pool entry */ + ioam_cache_ts_entry_free (thread_id, entry, cm->ip6_hbh_pop_node_index); + } +} + +inline static void +ioam_cache_ts_check_and_send (u32 thread_id, i32 pool_index) +{ + ioam_cache_main_t *cm = &ioam_cache_main; + ioam_cache_ts_entry_t *entry = 0; + entry = pool_elt_at_index (cm->ioam_ts_pool[thread_id], pool_index); + if (entry && entry->hbh) + { + if (entry->response_received == entry->max_responses || + entry->created_at + IOAM_CACHE_TS_TIMEOUT <= + vlib_time_now (cm->vlib_main)) + { + ioam_cache_ts_timer_reset (cm, entry); + ioam_cache_ts_send (thread_id, pool_index); + } + } +} + +inline static int +ioam_cache_ts_update (u32 thread_id, + i32 pool_index, + u32 buffer_index, ip6_hop_by_hop_header_t * hbh) +{ + ioam_cache_main_t *cm = &ioam_cache_main; + ioam_cache_ts_entry_t *entry = 0; + vlib_main_t *vm = cm->vlib_main; + vlib_frame_t *nf = 0; + u32 *to_next; + + entry = pool_elt_at_index (cm->ioam_ts_pool[thread_id], pool_index); + if (!pool_is_free (cm->ioam_ts_pool[thread_id], entry) && entry) + { + /* drop existing buffer */ + if (entry->hbh != 0) + { + nf = vlib_get_frame_to_node (vm, cm->error_node_index); + nf->n_vectors = 0; + to_next = vlib_frame_vector_args (nf); + nf->n_vectors = 1; + to_next[0] = entry->buffer_index; + vlib_put_frame_to_node (vm, cm->error_node_index, nf); + } + /* update */ + entry->buffer_index = buffer_index; + entry->hbh = hbh; + /* check 
and send */
+      ioam_cache_ts_check_and_send (thread_id, pool_index);
+      return (0);
+    }
+  return (-1);
+}
+
+/*
+ * Looks up the entry based on the e2e option pool index
+ * result = 0: found the entry
+ * result < 0: failed to find an entry
+ */
+inline static int
+ioam_cache_ts_lookup (ip6_header_t * ip0,
+                      u8 protocol,
+                      u16 src_port,
+                      u16 dst_port,
+                      u32 seq_no,
+                      ip6_hop_by_hop_header_t ** hbh,
+                      u32 * pool_index, u8 * thread_id, u8 response_seen)
+{
+  ioam_cache_main_t *cm = &ioam_cache_main;
+  ip6_hop_by_hop_header_t *hbh0 = 0;
+  ioam_e2e_cache_option_t *e2e = 0;
+
+  hbh0 = (ip6_hop_by_hop_header_t *) (ip0 + 1);
+  e2e =
+    (ioam_e2e_cache_option_t *) ((u8 *) hbh0 + cm->rewrite_pool_index_offset);
+  if ((u8 *) e2e < ((u8 *) hbh0 + ((hbh0->length + 1) << 3))
+      && e2e->hdr.type == HBH_OPTION_TYPE_IOAM_E2E_CACHE_ID)
+    {
+      ioam_cache_ts_entry_t *entry = 0;
+      *pool_index = e2e->pool_index;
+      *thread_id = e2e->pool_id;
+      entry = pool_elt_at_index (cm->ioam_ts_pool[*thread_id], *pool_index);
+      /* match */
+      if (entry &&
+          ip6_address_compare (&ip0->src_address, &entry->dst_address) == 0 &&
+          ip6_address_compare (&ip0->dst_address, &entry->src_address) == 0 &&
+          entry->src_port == dst_port &&
+          entry->dst_port == src_port && entry->seq_no == seq_no)
+        {
+          *hbh = entry->hbh;
+          entry->response_received += response_seen;
+          return (0);
+        }
+      else if (entry)
+        {
+          return (-1);
+        }
+    }
+  return (-1);
+}
+
+inline static u8 *
+format_ioam_cache_ts_entry (u8 * s, va_list * args)
+{
+  ioam_cache_ts_entry_t *e = va_arg (*args, ioam_cache_ts_entry_t *);
+  u32 thread_id = va_arg (*args, u32);
+  ioam_cache_main_t *cm = &ioam_cache_main;
+  ioam_e2e_id_option_t *e2e = 0;
+  vlib_main_t *vm = cm->vlib_main;
+  clib_time_t *ct = &vm->clib_time;
+
+  if (!e)
+    goto end;
+
+  if (e->hbh)
+    {
+      e2e =
+        ip6_ioam_find_hbh_option (e->hbh,
+                                  HBH_OPTION_TYPE_IOAM_EDGE_TO_EDGE_ID);
+
+      s =
+        format (s,
+                "%d: %U:%d to %U:%d seq_no %u buffer %u %U \n\t\tCreated at %U Received %d\n",
+                (e - cm->ioam_ts_pool[thread_id]), format_ip6_address,
+                &e->src_address, e->src_port, format_ip6_address,
+                &e->dst_address, e->dst_port, e->seq_no, e->buffer_index,
+                format_ip6_address, e2e ? &e2e->id : 0, format_time_interval,
+                "h:m:s:u",
+                (e->created_at -
+                 vm->cpu_time_main_loop_start) * ct->seconds_per_clock,
+                e->response_received);
+    }
+  else
+    {
+      s =
+        format (s,
+                "%d: %U:%d to %U:%d seq_no %u Buffer %u \n\t\tCreated at %U Received %d\n",
+                (e - cm->ioam_ts_pool[thread_id]), format_ip6_address,
+                &e->src_address, e->src_port, format_ip6_address,
+                &e->dst_address, e->dst_port, e->seq_no, e->buffer_index,
+                format_time_interval, "h:m:s:u",
+                (e->created_at -
+                 vm->cpu_time_main_loop_start) * ct->seconds_per_clock,
+                e->response_received);
+    }
+
+end:
+  return s;
+}
+
+/*
+ * Get the extended rewrite string for iOAM data in v6.
+ * This makes space for e2e options to carry cache pool info
+ * and the M-Anycast server address.
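+ * Resulting rewrite buffer layout (a sketch; offsets follow the code
+ * below, with option sizes rounded up to multiples of 8 bytes):
+ *
+ *   +---------------------------+  <- cm->rewrite
+ *   | ip6 hop-by-hop header     |
+ *   | trace/pot/seqno options   |  (per the ioam ip6 configs)
+ *   +---------------------------+  <- cm->rewrite_pool_index_offset
+ *   | ioam_e2e_cache_option_t   |  (pool_id + pool_index, per packet)
+ *   +---------------------------+
+ *   | ioam_e2e_id_option_t      |  (SR localsid of this node)
+ *   +---------------------------+
+ *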
+ * It set the rewrite string per configs in ioam ip6 + new option + * for cache along with offset to the option to populate cache + * pool id and index + */ +static inline int +ip6_ioam_ts_cache_set_rewrite (void) +{ + ip6_hop_by_hop_ioam_main_t *hm = &ip6_hop_by_hop_ioam_main; + ioam_cache_main_t *cm = &ioam_cache_main; + ip6_hop_by_hop_header_t *hbh; + u32 rewrite_len = 0; + ioam_e2e_cache_option_t *e2e = 0; + ioam_e2e_id_option_t *e2e_id = 0; + + vec_free (cm->rewrite); + ip6_ioam_set_rewrite (&(cm->rewrite), hm->has_trace_option, + hm->has_pot_option, hm->has_seqno_option); + hbh = (ip6_hop_by_hop_header_t *) cm->rewrite; + rewrite_len = ((hbh->length + 1) << 3); + vec_validate (cm->rewrite, + rewrite_len - 1 + IOAM_E2E_CACHE_OPTION_RND + + IOAM_E2E_ID_OPTION_RND); + hbh = (ip6_hop_by_hop_header_t *) cm->rewrite; + /* setup e2e id option to insert pool id and index of the node caching it */ + hbh->length += IOAM_E2E_CACHE_HBH_EXT_LEN + IOAM_E2E_ID_HBH_EXT_LEN; + cm->rewrite_pool_index_offset = rewrite_len; + e2e = (ioam_e2e_cache_option_t *) (cm->rewrite + rewrite_len); + e2e->hdr.type = HBH_OPTION_TYPE_IOAM_E2E_CACHE_ID + | HBH_OPTION_TYPE_SKIP_UNKNOWN; + e2e->hdr.length = sizeof (ioam_e2e_cache_option_t) - + sizeof (ip6_hop_by_hop_option_t); + e2e->e2e_type = 2; + e2e_id = + (ioam_e2e_id_option_t *) ((u8 *) e2e + sizeof (ioam_e2e_cache_option_t)); + e2e_id->hdr.type = + HBH_OPTION_TYPE_IOAM_EDGE_TO_EDGE_ID | HBH_OPTION_TYPE_SKIP_UNKNOWN; + e2e_id->hdr.length = + sizeof (ioam_e2e_id_option_t) - sizeof (ip6_hop_by_hop_option_t); + e2e_id->e2e_type = 1; + + return (0); +} + +static inline int +ip6_ioam_ts_cache_cleanup_rewrite (void) +{ + ioam_cache_main_t *cm = &ioam_cache_main; + + vec_free (cm->rewrite); + cm->rewrite = 0; + cm->rewrite_pool_index_offset = 0; + return (0); +} +#endif /* __included_ioam_cache_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/ip6/ioam_cache_all_api_h.h b/src/plugins/ioam/ip6/ioam_cache_all_api_h.h new file mode 100644 index 00000000..61272a51 --- /dev/null +++ b/src/plugins/ioam/ip6/ioam_cache_all_api_h.h @@ -0,0 +1,16 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* Include the generated file, see BUILT_SOURCES in Makefile.am */ +#include <ioam/ip6/ioam_cache.api.h> diff --git a/src/plugins/ioam/ip6/ioam_cache_msg_enum.h b/src/plugins/ioam/ip6/ioam_cache_msg_enum.h new file mode 100644 index 00000000..8afd067b --- /dev/null +++ b/src/plugins/ioam/ip6/ioam_cache_msg_enum.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef included_ioam_cache_msg_enum_h +#define included_ioam_cache_msg_enum_h + +#include <vppinfra/byte_order.h> + +#define vl_msg_id(n,h) n, +typedef enum { +#include <ioam/ip6/ioam_cache_all_api_h.h> + /* We'll want to know how many messages IDs we need... */ + VL_MSG_FIRST_AVAILABLE, +} vl_msg_id_t; +#undef vl_msg_id + +#endif /* included_ioam_cache_msg_enum_h */ diff --git a/src/plugins/ioam/ip6/ioam_cache_node.c b/src/plugins/ioam/ip6/ioam_cache_node.c new file mode 100644 index 00000000..dd27e127 --- /dev/null +++ b/src/plugins/ioam/ip6/ioam_cache_node.c @@ -0,0 +1,421 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * This file implements caching of ioam header and reattaching + * it in response message by performing request-response matching. + * Works for TCP SYN/SYN-ACK. + * This feature is used for anycast server selection. + * ioam data thus cached is used to measure and get complete round trip + * network path to help in server selection. + * There are 2 graph nodes defined to : + * 1. process packets that contain iOAM header and cache it + * 2. process TCP SYN-ACKs and reattach ioam header from the + * cache corresponding to TCP-SYN + * These graph nodes are attached to the vnet graph based on + * ioam cache and classifier configs. + * e.g. 
+ * If db06::06 is the anycast service IP6 address: + * + * set ioam ip6 cache + * + * Apply this classifier on interface where requests for anycast service are received: + * classify session acl-hit-next ip6-node ip6-lookup table-index 0 match l3 ip6 dst db06::06 + * ioam-decap anycast <<< ioam-decap is hooked to cache when set ioam ip6 cache is enabled + * + * Apply this classifier on interface where responses from anycast service are received: + * classify session acl-hit-next ip6-node ip6-add-from-cache-hop-by-hop table-index 0 match l3 + * ip6 src db06::06 ioam-encap anycast-response + * + */ +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vppinfra/error.h> +#include <vnet/ip/ip.h> +#include <ioam/ip6/ioam_cache.h> +#include <vnet/ip/ip6_hop_by_hop.h> +#include <vnet/ip/ip6_hop_by_hop_packet.h> + +typedef struct +{ + u32 next_index; + u32 flow_label; +} cache_trace_t; + +/* packet trace format function */ +static u8 * +format_cache_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + cache_trace_t *t = va_arg (*args, cache_trace_t *); + + s = format (s, "CACHE: flow_label %d, next index %d", + t->flow_label, t->next_index); + return s; +} + +#define foreach_cache_error \ +_(RECORDED, "ip6 iOAM headers cached") + +typedef enum +{ +#define _(sym,str) CACHE_ERROR_##sym, + foreach_cache_error +#undef _ + CACHE_N_ERROR, +} cache_error_t; + +static char *cache_error_strings[] = { +#define _(sym,string) string, + foreach_cache_error +#undef _ +}; + +typedef enum +{ + IOAM_CACHE_NEXT_POP_HBYH, + IOAM_CACHE_N_NEXT, +} cache_next_t; + +static uword +ip6_ioam_cache_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + u32 n_left_from, *from, *to_next; + cache_next_t next_index; + u32 recorded = 0; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + // TODO: Dual loop + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *p0; + u32 next0 = IOAM_CACHE_NEXT_POP_HBYH; + ip6_header_t *ip0; + ip6_hop_by_hop_header_t *hbh0; + tcp_header_t *tcp0; + u32 tcp_offset0; + + /* speculatively enqueue p0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, bi0); + ip0 = vlib_buffer_get_current (p0); + if (IP_PROTOCOL_TCP == + ip6_locate_header (p0, ip0, IP_PROTOCOL_TCP, &tcp_offset0)) + { + tcp0 = (tcp_header_t *) ((u8 *) ip0 + tcp_offset0); + if ((tcp0->flags & TCP_FLAG_SYN) == TCP_FLAG_SYN && + (tcp0->flags & TCP_FLAG_ACK) == 0) + { + /* Cache the ioam hbh header */ + hbh0 = (ip6_hop_by_hop_header_t *) (ip0 + 1); + if (0 == ioam_cache_add (p0, + ip0, + clib_net_to_host_u16 + (tcp0->src_port), + clib_net_to_host_u16 + (tcp0->dst_port), hbh0, + clib_net_to_host_u32 + (tcp0->seq_number) + 1)) + { + recorded++; + } + } + } + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE))) + { + if (p0->flags & VLIB_BUFFER_IS_TRACED) + { + cache_trace_t *t = + vlib_add_trace (vm, node, p0, sizeof (*t)); + t->flow_label = + clib_net_to_host_u32 + (ip0->ip_version_traffic_class_and_flow_label); + t->next_index = next0; + } + } + /* verify speculative enqueue, maybe switch current next frame */ + 
vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, ioam_cache_node.index, + CACHE_ERROR_RECORDED, recorded); + return frame->n_vectors; +} + +/* + * Node for IP6 iOAM header cache + */ +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (ioam_cache_node) = +{ + .function = ip6_ioam_cache_node_fn, + .name = "ip6-ioam-cache", + .vector_size = sizeof (u32), + .format_trace = format_cache_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN (cache_error_strings), + .error_strings = cache_error_strings, + .n_next_nodes = IOAM_CACHE_N_NEXT, + /* edit / add dispositions here */ + .next_nodes = + { + [IOAM_CACHE_NEXT_POP_HBYH] = "ip6-pop-hop-by-hop" + }, +}; +/* *INDENT-ON* */ + +typedef struct +{ + u32 next_index; +} ip6_add_from_cache_hbh_trace_t; + +/* packet trace format function */ +static u8 * +format_ip6_add_from_cache_hbh_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + ip6_add_from_cache_hbh_trace_t *t = va_arg (*args, + ip6_add_from_cache_hbh_trace_t + *); + + s = format (s, "IP6_ADD_FROM_CACHE_HBH: next index %d", t->next_index); + return s; +} + +vlib_node_registration_t ip6_add_from_cache_hbh_node; + +#define foreach_ip6_add_from_cache_hbh_error \ +_(PROCESSED, "Pkts w/ added ip6 hop-by-hop options") + +typedef enum +{ +#define _(sym,str) IP6_ADD_FROM_CACHE_HBH_ERROR_##sym, + foreach_ip6_add_from_cache_hbh_error +#undef _ + IP6_ADD_FROM_CACHE_HBH_N_ERROR, +} ip6_add_from_cache_hbh_error_t; + +static char *ip6_add_from_cache_hbh_error_strings[] = { +#define _(sym,string) string, + foreach_ip6_add_from_cache_hbh_error +#undef _ +}; + +#define foreach_ip6_ioam_cache_input_next \ + _(IP6_LOOKUP, "ip6-lookup") \ + _(DROP, "error-drop") + +typedef enum +{ +#define _(s,n) IP6_IOAM_CACHE_INPUT_NEXT_##s, + foreach_ip6_ioam_cache_input_next +#undef _ + IP6_IOAM_CACHE_INPUT_N_NEXT, +} ip6_ioam_cache_input_next_t; + + +static uword +ip6_add_from_cache_hbh_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + ioam_cache_main_t *cm = &ioam_cache_main; + u32 n_left_from, *from, *to_next; + ip_lookup_next_t next_index; + u32 processed = 0; + u8 *rewrite = 0; + u32 rewrite_len = 0; + u32 sr_rewrite_len = vec_len (cm->sr_rewrite_template); + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + // TODO: Dual loop + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + u32 next0; + ip6_header_t *ip0; + ip6_hop_by_hop_header_t *hbh0; + ip6_sr_header_t *srh0 = 0; + u64 *copy_src0, *copy_dst0; + u16 new_l0; + tcp_header_t *tcp0; + u32 tcp_offset0; + ioam_cache_entry_t *entry = 0; + + next0 = IP6_IOAM_CACHE_INPUT_NEXT_IP6_LOOKUP; + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + ip0 = vlib_buffer_get_current (b0); + if (IP_PROTOCOL_TCP != + ip6_locate_header (b0, ip0, IP_PROTOCOL_TCP, &tcp_offset0)) + { + goto TRACE0; + } + tcp0 = (tcp_header_t *) ((u8 *) ip0 + tcp_offset0); + if (((tcp0->flags & TCP_FLAG_SYN) == TCP_FLAG_SYN && + 
(tcp0->flags & TCP_FLAG_ACK) == TCP_FLAG_ACK) || + (tcp0->flags & TCP_FLAG_RST) == TCP_FLAG_RST) + { + if (0 != (entry = ioam_cache_lookup (ip0, + clib_net_to_host_u16 + (tcp0->src_port), + clib_net_to_host_u16 + (tcp0->dst_port), + clib_net_to_host_u32 + (tcp0->ack_number)))) + { + rewrite = entry->ioam_rewrite_string; + rewrite_len = vec_len (rewrite); + } + else + { + next0 = IP6_IOAM_CACHE_INPUT_NEXT_DROP; + goto TRACE0; + } + } + else + goto TRACE0; + + + /* Copy the ip header left by the required amount */ + copy_dst0 = (u64 *) (((u8 *) ip0) - (rewrite_len + sr_rewrite_len)); + copy_src0 = (u64 *) ip0; + + copy_dst0[0] = copy_src0[0]; + copy_dst0[1] = copy_src0[1]; + copy_dst0[2] = copy_src0[2]; + copy_dst0[3] = copy_src0[3]; + copy_dst0[4] = copy_src0[4]; + vlib_buffer_advance (b0, -(word) (rewrite_len + sr_rewrite_len)); + ip0 = vlib_buffer_get_current (b0); + + hbh0 = (ip6_hop_by_hop_header_t *) (ip0 + 1); + srh0 = (ip6_sr_header_t *) ((u8 *) hbh0 + rewrite_len); + /* $$$ tune, rewrite_len is a multiple of 8 */ + clib_memcpy (hbh0, rewrite, rewrite_len); + clib_memcpy (srh0, cm->sr_rewrite_template, sr_rewrite_len); + /* Copy dst address into the DA slot in the segment list */ + clib_memcpy (srh0->segments, ip0->dst_address.as_u64, + sizeof (ip6_address_t)); + /* Rewrite the ip6 dst address with the first hop */ + clib_memcpy (ip0->dst_address.as_u64, entry->next_hop.as_u64, + sizeof (ip6_address_t)); + clib_memcpy (&srh0->segments[1], + (u8 *) hbh0 + entry->my_address_offset, + sizeof (ip6_address_t)); + ioam_cache_entry_free (entry); + + /* Patch the protocol chain, insert the h-b-h (type 0) header */ + srh0->protocol = ip0->protocol; + hbh0->protocol = IPPROTO_IPV6_ROUTE; + ip0->protocol = 0; + new_l0 = + clib_net_to_host_u16 (ip0->payload_length) + rewrite_len + + sr_rewrite_len; + ip0->payload_length = clib_host_to_net_u16 (new_l0); + processed++; + TRACE0: + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + ip6_add_from_cache_hbh_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->next_index = next0; + } + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, ip6_add_from_cache_hbh_node.index, + IP6_ADD_FROM_CACHE_HBH_ERROR_PROCESSED, + processed); + return frame->n_vectors; +} +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (ip6_add_from_cache_hbh_node) = +{ + .function = ip6_add_from_cache_hbh_node_fn, + .name = "ip6-add-from-cache-hop-by-hop", + .vector_size = sizeof (u32), + .format_trace = format_ip6_add_from_cache_hbh_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN (ip6_add_from_cache_hbh_error_strings), + .error_strings = ip6_add_from_cache_hbh_error_strings, + /* See ip/lookup.h */ + .n_next_nodes = IP6_IOAM_CACHE_INPUT_N_NEXT, + .next_nodes = + { +#define _(s,n) [IP6_IOAM_CACHE_INPUT_NEXT_##s] = n, + foreach_ip6_ioam_cache_input_next +#undef _ + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (ip6_add_from_cache_hbh_node, + ip6_add_from_cache_hbh_node_fn) +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/ip6/ioam_cache_tunnel_select_node.c b/src/plugins/ioam/ip6/ioam_cache_tunnel_select_node.c new file mode 100644 index 00000000..79ee58ec --- /dev/null +++ 
b/src/plugins/ioam/ip6/ioam_cache_tunnel_select_node.c @@ -0,0 +1,768 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ioam_cache_tunnel_select_node.c + * This file implements anycast server selection using ioam data + * attached to anycast service selection. + * Anycast service is reachable via multiple servers reachable + * over SR tunnels. + * Works with TCP Anycast application. + * Cache entry is created when TCP SYN is received for anycast destination. + * Response TCP SYN ACKs for anycast service is compared and selected + * response is forwarded. + * The functionality is introduced via graph nodes that are hooked into + * vnet graph via classifier configs like below: + * + * Enable anycast service selection: + * set ioam ip6 sr-tunnel-select oneway + * + * Enable following classifier on the anycast service client facing interface + * e.g. anycast service is db06::06 then: + * classify session acl-hit-next ip6-node ip6-add-syn-hop-by-hop table-index 0 match l3 + * ip6 dst db06::06 ioam-encap anycast + * + * Enable following classifier on the interfaces facing the server of anycast service: + * classify session acl-hit-next ip6-node ip6-lookup table-index 0 match l3 + * ip6 src db06::06 ioam-decap anycast + * + */ +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vppinfra/error.h> +#include <vnet/ip/ip.h> +#include <vnet/srv6/sr.h> +#include <ioam/ip6/ioam_cache.h> +#include <vnet/ip/ip6_hop_by_hop.h> +#include <vnet/ip/ip6_hop_by_hop_packet.h> + +typedef struct +{ + u32 next_index; + u32 flow_label; +} cache_ts_trace_t; + +/* packet trace format function */ +static u8 * +format_cache_ts_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + cache_ts_trace_t *t = va_arg (*args, cache_ts_trace_t *); + + s = format (s, "CACHE: flow_label %d, next index %d", + t->flow_label, t->next_index); + return s; +} + +#define foreach_cache_ts_error \ +_(RECORDED, "ip6 iOAM headers cached") + +typedef enum +{ +#define _(sym,str) CACHE_TS_ERROR_##sym, + foreach_cache_ts_error +#undef _ + CACHE_TS_N_ERROR, +} cache_ts_error_t; + +static char *cache_ts_error_strings[] = { +#define _(sym,string) string, + foreach_cache_ts_error +#undef _ +}; + +typedef enum +{ + IOAM_CACHE_TS_NEXT_POP_HBYH, + IOAM_CACHE_TS_ERROR_NEXT_DROP, + IOAM_CACHE_TS_N_NEXT, +} cache_ts_next_t; + +static uword +ip6_ioam_cache_ts_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + ioam_cache_main_t *cm = &ioam_cache_main; + u32 n_left_from, *from, *to_next; + cache_ts_next_t next_index; + u32 recorded = 0; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + // TODO: dual loop + while 
(n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *p0; + u32 next0 = IOAM_CACHE_TS_NEXT_POP_HBYH; + ip6_header_t *ip0; + ip6_hop_by_hop_header_t *hbh0, *hbh_cmp; + tcp_header_t *tcp0; + u32 tcp_offset0; + u32 cache_ts_index = 0; + u8 cache_thread_id = 0; + int result = 0; + int skip = 0; + + bi0 = from[0]; + from += 1; + n_left_from -= 1; + + p0 = vlib_get_buffer (vm, bi0); + ip0 = vlib_buffer_get_current (p0); + if (IP_PROTOCOL_TCP == + ip6_locate_header (p0, ip0, IP_PROTOCOL_TCP, &tcp_offset0)) + { + tcp0 = (tcp_header_t *) ((u8 *) ip0 + tcp_offset0); + if ((tcp0->flags & TCP_FLAG_SYN) == TCP_FLAG_SYN && + (tcp0->flags & TCP_FLAG_ACK) == TCP_FLAG_ACK) + { + /* Look up and compare */ + hbh0 = (ip6_hop_by_hop_header_t *) (ip0 + 1); + + if (0 == ioam_cache_ts_lookup (ip0, + hbh0->protocol, + clib_net_to_host_u16 + (tcp0->src_port), + clib_net_to_host_u16 + (tcp0->dst_port), + clib_net_to_host_u32 + (tcp0->ack_number), &hbh_cmp, + &cache_ts_index, + &cache_thread_id, 1)) + { + /* response seen */ + result = -1; + if (hbh_cmp) + result = + ip6_ioam_analyse_compare_path_delay (hbh0, hbh_cmp, + cm->criteria_oneway); + if (result >= 0) + { + /* current syn/ack is worse than the earlier: Drop */ + next0 = IOAM_CACHE_TS_ERROR_NEXT_DROP; + /* Check if all responses are received or time has exceeded + send cached response if yes */ + ioam_cache_ts_check_and_send (cache_thread_id, + cache_ts_index); + } + else + { + /* Update cache with this buffer */ + /* If successfully updated then skip sending it */ + if (0 == + (result = + ioam_cache_ts_update (cache_thread_id, + cache_ts_index, bi0, + hbh0))) + { + skip = 1; + } + else + next0 = IOAM_CACHE_TS_ERROR_NEXT_DROP; + } + } + else + { + next0 = IOAM_CACHE_TS_ERROR_NEXT_DROP; + } + } + else if ((tcp0->flags & TCP_FLAG_RST) == TCP_FLAG_RST) + { + /* Look up and compare */ + hbh0 = (ip6_hop_by_hop_header_t *) (ip0 + 1); + if (0 == ioam_cache_ts_lookup (ip0, hbh0->protocol, clib_net_to_host_u16 (tcp0->src_port), clib_net_to_host_u16 (tcp0->dst_port), clib_net_to_host_u32 (tcp0->ack_number), &hbh_cmp, &cache_ts_index, &cache_thread_id, 1)) //response seen + { + next0 = IOAM_CACHE_TS_ERROR_NEXT_DROP; + if (hbh_cmp) + ioam_cache_ts_check_and_send (cache_thread_id, + cache_ts_index); + } + + } + } + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE))) + { + if (p0->flags & VLIB_BUFFER_IS_TRACED) + { + cache_ts_trace_t *t = + vlib_add_trace (vm, node, p0, sizeof (*t)); + t->flow_label = + clib_net_to_host_u32 + (ip0->ip_version_traffic_class_and_flow_label); + t->next_index = next0; + } + } + /* verify speculative enqueue, maybe switch current next frame */ + if (!skip) + { + to_next[0] = bi0; + to_next += 1; + n_left_to_next -= 1; + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + vlib_node_increment_counter (vm, ioam_cache_ts_node.index, + CACHE_TS_ERROR_RECORDED, recorded); + return frame->n_vectors; +} + +/* + * Node for IP6 iOAM header cache + */ +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (ioam_cache_ts_node) = +{ + .function = ip6_ioam_cache_ts_node_fn, + .name = "ip6-ioam-tunnel-select", + .vector_size = sizeof (u32), + .format_trace = format_cache_ts_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN (cache_ts_error_strings), + .error_strings = cache_ts_error_strings, + .n_next_nodes = IOAM_CACHE_TS_N_NEXT, + /* edit / add dispositions here */ + .next_nodes = + { + 
[IOAM_CACHE_TS_NEXT_POP_HBYH] = "ip6-pop-hop-by-hop", + [IOAM_CACHE_TS_ERROR_NEXT_DROP] = "error-drop", + }, +}; +/* *INDENT-ON* */ + +typedef struct +{ + u32 next_index; +} ip6_reset_ts_hbh_trace_t; + +/* packet trace format function */ +static u8 * +format_ip6_reset_ts_hbh_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + ip6_reset_ts_hbh_trace_t *t = va_arg (*args, + ip6_reset_ts_hbh_trace_t *); + + s = + format (s, "IP6_IOAM_RESET_TUNNEL_SELECT_HBH: next index %d", + t->next_index); + return s; +} + +vlib_node_registration_t ip6_reset_ts_hbh_node; + +#define foreach_ip6_reset_ts_hbh_error \ +_(PROCESSED, "iOAM Syn/Ack Pkts processed") \ +_(SAVED, "iOAM Syn Pkts state saved") \ +_(REMOVED, "iOAM Syn/Ack Pkts state removed") + +typedef enum +{ +#define _(sym,str) IP6_RESET_TS_HBH_ERROR_##sym, + foreach_ip6_reset_ts_hbh_error +#undef _ + IP6_RESET_TS_HBH_N_ERROR, +} ip6_reset_ts_hbh_error_t; + +static char *ip6_reset_ts_hbh_error_strings[] = { +#define _(sym,string) string, + foreach_ip6_reset_ts_hbh_error +#undef _ +}; + +#define foreach_ip6_ioam_cache_ts_input_next \ + _(IP6_LOOKUP, "ip6-lookup") \ + _(DROP, "error-drop") + +typedef enum +{ +#define _(s,n) IP6_IOAM_CACHE_TS_INPUT_NEXT_##s, + foreach_ip6_ioam_cache_ts_input_next +#undef _ + IP6_IOAM_CACHE_TS_INPUT_N_NEXT, +} ip6_ioam_cache_ts_input_next_t; + + +static uword +ip6_reset_ts_hbh_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + ioam_cache_main_t *cm = &ioam_cache_main; + u32 n_left_from, *from, *to_next; + ip_lookup_next_t next_index; + u32 processed = 0, cache_ts_added = 0; + u64 now; + u8 *rewrite = cm->rewrite; + u32 rewrite_length = vec_len (rewrite); + ioam_e2e_cache_option_t *e2e = 0; + u8 no_of_responses = cm->wait_for_responses; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + now = vlib_time_now (vm); + while (n_left_from >= 4 && n_left_to_next >= 2) + { + u32 bi0, bi1; + vlib_buffer_t *b0, *b1; + u32 next0, next1; + ip6_header_t *ip0, *ip1; + tcp_header_t *tcp0, *tcp1; + u32 tcp_offset0, tcp_offset1; + ip6_hop_by_hop_header_t *hbh0, *hbh1; + u64 *copy_src0, *copy_dst0, *copy_src1, *copy_dst1; + u16 new_l0, new_l1; + u32 pool_index0 = 0, pool_index1 = 0; + + next0 = next1 = IP6_IOAM_CACHE_TS_INPUT_NEXT_IP6_LOOKUP; + /* Prefetch next iteration. 
*/ + { + vlib_buffer_t *p2, *p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + CLIB_PREFETCH (p2->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (p3->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD); + } + + + /* speculatively enqueue b0 to the current next frame */ + to_next[0] = bi0 = from[0]; + to_next[1] = bi1 = from[1]; + from += 2; + to_next += 2; + n_left_from -= 2; + n_left_to_next -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + ip0 = vlib_buffer_get_current (b0); + ip1 = vlib_buffer_get_current (b1); + + if (IP_PROTOCOL_TCP != + ip6_locate_header (b0, ip0, IP_PROTOCOL_TCP, &tcp_offset0)) + { + goto NEXT00; + } + tcp0 = (tcp_header_t *) ((u8 *) ip0 + tcp_offset0); + if ((tcp0->flags & TCP_FLAG_SYN) == TCP_FLAG_SYN && + (tcp0->flags & TCP_FLAG_ACK) == 0) + { + if (no_of_responses > 0) + { + /* Create TS select entry */ + if (0 == ioam_cache_ts_add (ip0, + clib_net_to_host_u16 + (tcp0->src_port), + clib_net_to_host_u16 + (tcp0->dst_port), + clib_net_to_host_u32 + (tcp0->seq_number) + 1, + no_of_responses, now, + vm->thread_index, &pool_index0)) + { + cache_ts_added++; + } + } + copy_dst0 = (u64 *) (((u8 *) ip0) - rewrite_length); + copy_src0 = (u64 *) ip0; + + copy_dst0[0] = copy_src0[0]; + copy_dst0[1] = copy_src0[1]; + copy_dst0[2] = copy_src0[2]; + copy_dst0[3] = copy_src0[3]; + copy_dst0[4] = copy_src0[4]; + + vlib_buffer_advance (b0, -(word) rewrite_length); + ip0 = vlib_buffer_get_current (b0); + + hbh0 = (ip6_hop_by_hop_header_t *) (ip0 + 1); + /* $$$ tune, rewrite_length is a multiple of 8 */ + clib_memcpy (hbh0, rewrite, rewrite_length); + e2e = + (ioam_e2e_cache_option_t *) ((u8 *) hbh0 + + cm->rewrite_pool_index_offset); + e2e->pool_id = (u8) vm->thread_index; + e2e->pool_index = pool_index0; + ioam_e2e_id_rewrite_handler ((ioam_e2e_id_option_t *) + ((u8 *) e2e + + sizeof (ioam_e2e_cache_option_t)), + &cm->sr_localsid_ts); + /* Patch the protocol chain, insert the h-b-h (type 0) header */ + hbh0->protocol = ip0->protocol; + ip0->protocol = 0; + new_l0 = + clib_net_to_host_u16 (ip0->payload_length) + rewrite_length; + ip0->payload_length = clib_host_to_net_u16 (new_l0); + processed++; + } + + NEXT00: + if (IP_PROTOCOL_TCP != + ip6_locate_header (b1, ip1, IP_PROTOCOL_TCP, &tcp_offset1)) + { + goto TRACE00; + } + tcp1 = (tcp_header_t *) ((u8 *) ip1 + tcp_offset1); + if ((tcp1->flags & TCP_FLAG_SYN) == TCP_FLAG_SYN && + (tcp1->flags & TCP_FLAG_ACK) == 0) + { + if (no_of_responses > 0) + { + /* Create TS select entry */ + if (0 == ioam_cache_ts_add (ip1, + clib_net_to_host_u16 + (tcp1->src_port), + clib_net_to_host_u16 + (tcp1->dst_port), + clib_net_to_host_u32 + (tcp1->seq_number) + 1, + no_of_responses, now, + vm->thread_index, &pool_index1)) + { + cache_ts_added++; + } + } + + copy_dst1 = (u64 *) (((u8 *) ip1) - rewrite_length); + copy_src1 = (u64 *) ip1; + + copy_dst1[0] = copy_src1[0]; + copy_dst1[1] = copy_src1[1]; + copy_dst1[2] = copy_src1[2]; + copy_dst1[3] = copy_src1[3]; + copy_dst1[4] = copy_src1[4]; + + vlib_buffer_advance (b1, -(word) rewrite_length); + ip1 = vlib_buffer_get_current (b1); + + hbh1 = (ip6_hop_by_hop_header_t *) (ip1 + 1); + /* $$$ tune, rewrite_length is a multiple of 8 */ + clib_memcpy (hbh1, rewrite, rewrite_length); + e2e = + (ioam_e2e_cache_option_t *) ((u8 *) hbh1 + + cm->rewrite_pool_index_offset); + e2e->pool_id = (u8) vm->thread_index; + e2e->pool_index = pool_index1; + 
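+	  /* Record (thread id, cache pool index) in the e2e cache option
+	     so later iOAM e2e processing can locate this flow's cached
+	     entry. */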
ioam_e2e_id_rewrite_handler ((ioam_e2e_id_option_t *) + ((u8 *) e2e + + sizeof (ioam_e2e_cache_option_t)), + &cm->sr_localsid_ts); + /* Patch the protocol chain, insert the h-b-h (type 0) header */ + hbh1->protocol = ip1->protocol; + ip1->protocol = 0; + new_l1 = + clib_net_to_host_u16 (ip1->payload_length) + rewrite_length; + ip1->payload_length = clib_host_to_net_u16 (new_l1); + processed++; + } + + TRACE00: + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE))) + { + if (b0->flags & VLIB_BUFFER_IS_TRACED) + { + ip6_reset_ts_hbh_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->next_index = next0; + } + if (b1->flags & VLIB_BUFFER_IS_TRACED) + { + ip6_reset_ts_hbh_trace_t *t = + vlib_add_trace (vm, node, b1, sizeof (*t)); + t->next_index = next1; + } + + } + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, next0, next1); + } + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + u32 next0; + ip6_header_t *ip0; + tcp_header_t *tcp0; + u32 tcp_offset0; + ip6_hop_by_hop_header_t *hbh0; + u64 *copy_src0, *copy_dst0; + u16 new_l0; + u32 pool_index0 = 0; + + next0 = IP6_IOAM_CACHE_TS_INPUT_NEXT_IP6_LOOKUP; + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + ip0 = vlib_buffer_get_current (b0); + if (IP_PROTOCOL_TCP != + ip6_locate_header (b0, ip0, IP_PROTOCOL_TCP, &tcp_offset0)) + { + goto TRACE0; + } + tcp0 = (tcp_header_t *) ((u8 *) ip0 + tcp_offset0); + if ((tcp0->flags & TCP_FLAG_SYN) == TCP_FLAG_SYN && + (tcp0->flags & TCP_FLAG_ACK) == 0) + { + if (no_of_responses > 0) + { + /* Create TS select entry */ + if (0 == ioam_cache_ts_add (ip0, + clib_net_to_host_u16 + (tcp0->src_port), + clib_net_to_host_u16 + (tcp0->dst_port), + clib_net_to_host_u32 + (tcp0->seq_number) + 1, + no_of_responses, now, + vm->thread_index, &pool_index0)) + { + cache_ts_added++; + } + } + copy_dst0 = (u64 *) (((u8 *) ip0) - rewrite_length); + copy_src0 = (u64 *) ip0; + + copy_dst0[0] = copy_src0[0]; + copy_dst0[1] = copy_src0[1]; + copy_dst0[2] = copy_src0[2]; + copy_dst0[3] = copy_src0[3]; + copy_dst0[4] = copy_src0[4]; + + vlib_buffer_advance (b0, -(word) rewrite_length); + ip0 = vlib_buffer_get_current (b0); + + hbh0 = (ip6_hop_by_hop_header_t *) (ip0 + 1); + /* $$$ tune, rewrite_length is a multiple of 8 */ + clib_memcpy (hbh0, rewrite, rewrite_length); + e2e = + (ioam_e2e_cache_option_t *) ((u8 *) hbh0 + + cm->rewrite_pool_index_offset); + e2e->pool_id = (u8) vm->thread_index; + e2e->pool_index = pool_index0; + ioam_e2e_id_rewrite_handler ((ioam_e2e_id_option_t *) + ((u8 *) e2e + + sizeof (ioam_e2e_cache_option_t)), + &cm->sr_localsid_ts); + /* Patch the protocol chain, insert the h-b-h (type 0) header */ + hbh0->protocol = ip0->protocol; + ip0->protocol = 0; + new_l0 = + clib_net_to_host_u16 (ip0->payload_length) + rewrite_length; + ip0->payload_length = clib_host_to_net_u16 (new_l0); + processed++; + } + TRACE0: + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + ip6_reset_ts_hbh_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->next_index = next0; + } + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + 
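+      /* Frame fully consumed: hand the enqueued buffers to their next nodes. */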
vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, ip6_reset_ts_hbh_node.index, + IP6_RESET_TS_HBH_ERROR_PROCESSED, processed); + vlib_node_increment_counter (vm, ip6_reset_ts_hbh_node.index, + IP6_RESET_TS_HBH_ERROR_SAVED, cache_ts_added); + + return frame->n_vectors; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (ip6_reset_ts_hbh_node) = +{ + .function = ip6_reset_ts_hbh_node_fn, + .name = "ip6-add-syn-hop-by-hop", + .vector_size = sizeof (u32), + .format_trace = format_ip6_reset_ts_hbh_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN (ip6_reset_ts_hbh_error_strings), + .error_strings = ip6_reset_ts_hbh_error_strings, + /* See ip/lookup.h */ + .n_next_nodes = IP6_IOAM_CACHE_TS_INPUT_N_NEXT, + .next_nodes = + { +#define _(s,n) [IP6_IOAM_CACHE_TS_INPUT_NEXT_##s] = n, + foreach_ip6_ioam_cache_ts_input_next +#undef _ + }, +}; + +VLIB_NODE_FUNCTION_MULTIARCH (ip6_reset_ts_hbh_node, ip6_reset_ts_hbh_node_fn) +/* *INDENT-ON* */ + +vlib_node_registration_t ioam_cache_ts_timer_tick_node; + +typedef struct +{ + u32 thread_index; +} ioam_cache_ts_timer_tick_trace_t; + +/* packet trace format function */ +static u8 * +format_ioam_cache_ts_timer_tick_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + ioam_cache_ts_timer_tick_trace_t *t = + va_arg (*args, ioam_cache_ts_timer_tick_trace_t *); + + s = format (s, "IOAM_CACHE_TS_TIMER_TICK: thread index %d", + t->thread_index); + return s; +} + +#define foreach_ioam_cache_ts_timer_tick_error \ + _(TIMER, "Timer events") + +typedef enum +{ +#define _(sym,str) IOAM_CACHE_TS_TIMER_TICK_ERROR_##sym, + foreach_ioam_cache_ts_timer_tick_error +#undef _ + IOAM_CACHE_TS_TIMER_TICK_N_ERROR, +} ioam_cache_ts_timer_tick_error_t; + +static char *ioam_cache_ts_timer_tick_error_strings[] = { +#define _(sym,string) string, + foreach_ioam_cache_ts_timer_tick_error +#undef _ +}; + +void +ioam_cache_ts_timer_node_enable (vlib_main_t * vm, u8 enable) +{ + vlib_node_set_state (vm, ioam_cache_ts_timer_tick_node.index, + enable == + 0 ? 
VLIB_NODE_STATE_DISABLED :
+		       VLIB_NODE_STATE_POLLING);
+}
+
+void
+expired_cache_ts_timer_callback (u32 * expired_timers)
+{
+  ioam_cache_main_t *cm = &ioam_cache_main;
+  int i;
+  u32 pool_index;
+  u32 thread_index = vlib_get_thread_index ();
+  u32 count = 0;
+
+  for (i = 0; i < vec_len (expired_timers); i++)
+    {
+      /* Extract the pool index (low 28 bits of the timer handle) */
+      pool_index = expired_timers[i] & 0x0FFFFFFF;
+
+      /* Handle expiration */
+      ioam_cache_ts_send (thread_index, pool_index);
+      count++;
+    }
+  vlib_node_increment_counter (cm->vlib_main,
+			       ioam_cache_ts_timer_tick_node.index,
+			       IOAM_CACHE_TS_TIMER_TICK_ERROR_TIMER, count);
+}
+
+static uword
+ioam_cache_ts_timer_tick_node_fn (vlib_main_t * vm,
+				  vlib_node_runtime_t * node,
+				  vlib_frame_t * f)
+{
+  ioam_cache_main_t *cm = &ioam_cache_main;
+  u32 my_thread_index = vlib_get_thread_index ();
+  struct timespec ts, tsrem;
+
+  tw_timer_expire_timers_16t_2w_512sl (&cm->timer_wheels[my_thread_index],
+				       vlib_time_now (vm));
+  ts.tv_sec = 0;
+  ts.tv_nsec = 1000 * 1000 * IOAM_CACHE_TS_TICK;
+  while (nanosleep (&ts, &tsrem) < 0)
+    {
+      ts = tsrem;
+    }
+
+  return 0;
+}
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (ioam_cache_ts_timer_tick_node) = {
+  .function = ioam_cache_ts_timer_tick_node_fn,
+  .name = "ioam-cache-ts-timer-tick",
+  .format_trace = format_ioam_cache_ts_timer_tick_trace,
+  .type = VLIB_NODE_TYPE_INPUT,
+
+  .n_errors = ARRAY_LEN(ioam_cache_ts_timer_tick_error_strings),
+  .error_strings = ioam_cache_ts_timer_tick_error_strings,
+
+  .n_next_nodes = 1,
+
+  .state = VLIB_NODE_STATE_DISABLED,
+
+  /* edit / add dispositions here */
+  .next_nodes = {
+    [0] = "error-drop",
+  },
+};
+/* *INDENT-ON* */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/ioam/ipfixcollector/ipfixcollector.c b/src/plugins/ioam/ipfixcollector/ipfixcollector.c
new file mode 100644
index 00000000..71b934ec
--- /dev/null
+++ b/src/plugins/ioam/ipfixcollector/ipfixcollector.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/ip/ip.h>
+#include <vnet/plugin/plugin.h>
+#include <vnet/udp/udp.h>
+#include <ioam/ipfixcollector/ipfixcollector.h>
+
+ipfix_collector_main_t ipfix_collector_main;
+
+/**
+ * @brief IP-FIX SetID registration function.
+ *
+ * This function can be used by other VPP graph nodes to receive IP-FIX
+ * packets with a particular set-id.
+ *
+ * @param vlib_main_t Vlib main of the graph node which is interested in
+ *                    getting IP-FIX packets.
+ * @param ipfix_client_add_del_t Structure describing the client node which
+ *                               is interested in getting the IP-FIX packets
+ *                               for a SetID.
+ *
+ * @returns 0 on success.
+ * @returns Error codes (<0) otherwise.
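+ *
+ * @note Hypothetical usage sketch; "my_node" and set-id 256 are
+ *       illustrative only, not part of this plugin:
+ * @verbatim
+ *   ipfix_client_add_del_t info = { 0 };
+ *   info.client_name = format (0, "my-setid-client");
+ *   info.client_node = my_node.index;  // graph node to hand matching packets to
+ *   info.ipfix_setid = 256;
+ *   info.del = 0;                      // 0 = add, 1 = delete
+ *   if (ipfix_collector_reg_setid (vm, &info) < 0)
+ *     clib_warning ("set-id already registered or bad params");
+ *   vec_free (info.client_name);       // reg_setid keeps its own copy
+ * @endverbatim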
+ */
+int
+ipfix_collector_reg_setid (vlib_main_t * vm, ipfix_client_add_del_t * info)
+{
+  ipfix_collector_main_t *cm = &ipfix_collector_main;
+  uword *p = NULL;
+  int i;
+  ipfix_client *client = 0;
+
+  if ((!info) || (!info->client_name))
+    return IPFIX_COLLECTOR_ERR_INVALID_PARAM;
+
+  p = hash_get (cm->client_reg_table, info->ipfix_setid);
+  client = p ? pool_elt_at_index (cm->client_reg_pool, (*p)) : NULL;
+
+  if (info->del)
+    {
+      if (!client)
+	return 0;		/* No handler registered; nothing to delete, report success */
+
+      hash_unset (cm->client_reg_table, info->ipfix_setid);
+      vec_free (client->client_name);
+      pool_put (cm->client_reg_pool, client);
+      return 0;
+    }
+
+  if (client)
+    return IPFIX_COLLECTOR_ERR_REG_EXISTS;
+
+  pool_get (cm->client_reg_pool, client);
+  i = client - cm->client_reg_pool;
+  client->client_name = vec_dup (info->client_name);
+  client->client_node = info->client_node;
+  client->client_next_node = vlib_node_add_next (vm,
+						 ipfix_collector_node.index,
+						 client->client_node);
+  client->set_id = info->ipfix_setid;
+
+  hash_set (cm->client_reg_table, info->ipfix_setid, i);
+  return 0;
+}
+
+static clib_error_t *
+ipfix_collector_init (vlib_main_t * vm)
+{
+  clib_error_t *error = 0;
+  ipfix_collector_main_t *cm = &ipfix_collector_main;
+
+  cm->vlib_main = vm;
+  cm->vnet_main = vnet_get_main ();
+
+  cm->client_reg_pool = NULL;
+  cm->client_reg_table = hash_create (0, sizeof (uword));
+
+  udp_register_dst_port (vm,
+			 UDP_DST_PORT_ipfix,
+			 ipfix_collector_node.index, 1 /* is_ip4 */ );
+  return error;
+}
+
+VLIB_INIT_FUNCTION (ipfix_collector_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/ioam/ipfixcollector/ipfixcollector.h b/src/plugins/ioam/ipfixcollector/ipfixcollector.h
new file mode 100644
index 00000000..ee570316
--- /dev/null
+++ b/src/plugins/ioam/ipfixcollector/ipfixcollector.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PLUGINS_IPFIXCOLLECTOR_PLUGIN_IPFIXCOLLECTOR_IPFIXCOLLECTOR_H_
+#define PLUGINS_IPFIXCOLLECTOR_PLUGIN_IPFIXCOLLECTOR_IPFIXCOLLECTOR_H_
+
+#include <vppinfra/pool.h>
+#include <vppinfra/hash.h>
+#include <vppinfra/error.h>
+
+#define IPFIX_COLLECTOR_CLIENT_NAME_MAX 64
+
+#define IPFIX_COLLECTOR_ERR_INVALID_PARAM -1
+#define IPFIX_COLLECTOR_ERR_REG_EXISTS -2
+
+/** @brief Structure for other nodes to use when registering with the
+    IP-FIX collector.
+*/
+typedef struct
+{
+  /** String containing name of the client interested in getting
+      ip-fix packets. */
+  u8 *client_name;
+
+  /** Node index where packets have to be redirected. */
+  u32 client_node;
+
+  /** Set ID of the IP-FIX messages the client is interested in receiving. */
+  u16 ipfix_setid;
+
+  /** Add(0) or del(1) operation. */
+  u16 del;
+} ipfix_client_add_del_t;
+
+/** @brief IP-FIX collector internal client structure to store SetID to
+    client node ID.
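+    Entries are allocated by ipfix_collector_reg_setid() and owned by the
+    collector's pool; clients never free them directly.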
+*/
+typedef struct
+{
+  /** String containing name of the client interested in getting
+      ip-fix packets. */
+  u8 *client_name;
+
+  /** Node index where packets have to be redirected. */
+  u32 client_node;
+
+  /** ipfix-collector next index where packets have to be redirected. */
+  u32 client_next_node;
+
+  /** Set ID of the IP-FIX messages the client is interested in receiving. */
+  u16 set_id;
+} ipfix_client;
+
+/** @brief IP-FIX collector main structure for the SetID to client node ID
+    mapping.
+  @note cache aligned.
+*/
+typedef struct
+{
+  /** Hash table mapping an IP-FIX set-id (the key) to an index into the
+      client registration pool. */
+  uword *client_reg_table;
+
+  /** Pool of client node information for the IP-FIX SetID. */
+  ipfix_client *client_reg_pool;
+
+  /** Pointer to VLib main for the node - ipfix-collector. */
+  vlib_main_t *vlib_main;
+
+  /** Pointer to vnet main for convenience. */
+  vnet_main_t *vnet_main;
+} ipfix_collector_main_t;
+
+extern vlib_node_registration_t ipfix_collector_node;
+
+extern ipfix_collector_main_t ipfix_collector_main;
+
+/**
+ * @brief IP-FIX SetID registration function.
+ *
+ * This function can be used by other VPP graph nodes to receive IP-FIX
+ * packets with a particular set-id.
+ *
+ * @param vlib_main_t Vlib main of the graph node which is interested in
+ *                    getting IP-FIX packets.
+ * @param ipfix_client_add_del_t Structure describing the client node which
+ *                               is interested in getting the IP-FIX packets
+ *                               for a SetID.
+ *
+ * @returns 0 on success.
+ * @returns Error codes (<0) otherwise.
+ */
+int
+ipfix_collector_reg_setid (vlib_main_t * vm, ipfix_client_add_del_t * info);
+
+always_inline ipfix_client *
+ipfix_collector_get_client (u16 set_id)
+{
+  ipfix_collector_main_t *cm = &ipfix_collector_main;
+  uword *p;
+
+  p = hash_get (cm->client_reg_table, set_id);
+  return (p ? pool_elt_at_index (cm->client_reg_pool, (*p)) : NULL);
+}
+
+#endif /* PLUGINS_IPFIXCOLLECTOR_PLUGIN_IPFIXCOLLECTOR_IPFIXCOLLECTOR_H_ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/ioam/ipfixcollector/node.c b/src/plugins/ioam/ipfixcollector/node.c
new file mode 100644
index 00000000..fce997ae
--- /dev/null
+++ b/src/plugins/ioam/ipfixcollector/node.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vppinfra/error.h>
+#include <ioam/ipfixcollector/ipfixcollector.h>
+#include <vnet/flow/ipfix_packet.h>
+
+#define foreach_ipfix_collector_error \
+_(PROCESSED, "Number of IP-Fix packets processed") \
+_(NO_LISTENER, "Number of IP-Fix packets with no listener")
+
+typedef enum
+{
+#define _(sym,str) IPFIX_COLLECTOR_ERROR_##sym,
+  foreach_ipfix_collector_error
+#undef _
+    IPFIX_COLLECTOR_N_ERROR,
+} ipfix_collector_error_t;
+
+static char *ipfix_collector_error_strings[] = {
+#define _(sym,string) string,
+  foreach_ipfix_collector_error
+#undef _
+};
+
+typedef enum
+{
+  IPFIX_COLLECTOR_NEXT_DROP,
+  IPFIX_COLLECTOR_N_NEXT,
+} ipfix_collector_next_t;
+
+typedef struct
+{
+  u32 next_node;
+  u16 set_id;
+  u16 pad;
+} ipfix_collector_trace_t;
+
+vlib_node_registration_t ipfix_collector_node;
+
+/* packet trace format function */
+static u8 *
+format_ipfix_collector_trace (u8 * s, va_list * args)
+{
+  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+  ipfix_collector_trace_t *t = va_arg (*args, ipfix_collector_trace_t *);
+
+  s = format (s,
+	      "IPFIX_COLLECTOR: set_id %u, next_node %u", t->set_id,
+	      t->next_node);
+  return s;
+}
+
+/**
+ * @brief Node to receive IP-FIX packets.
+ * @node ipfix-collector
+ *
+ * This function receives IP-FIX packets and forwards them to other graph
+ * nodes based on the SetID field in the IP-FIX message.
+ *
+ * @param vm vlib_main_t corresponding to the current thread.
+ * @param node vlib_node_runtime_t data for this node.
+ * @param frame vlib_frame_t whose contents should be dispatched.
+ *
+ * @par Graph mechanics: buffer, next index usage
+ *
+ * <em>Uses:</em>
+ * - <code>vlib_buffer_get_current(p0)</code>
+ * - Parses the IP-FIX packet to extract the SetId which will be used to
+ *   decide the next node where the packet should be enqueued.
+ *
+ * <em>Next Index:</em>
+ * - Dispatches the packet to other VPP graph nodes based on their
+ *   registration for the IP-FIX SetId using the API
+ *   ipfix_collector_reg_setid().
+ */
+uword
+ipfix_collector_node_fn (vlib_main_t * vm,
+			 vlib_node_runtime_t * node,
+			 vlib_frame_t * from_frame)
+{
+  u32 n_left_from, next_index, *from, *to_next;
+  word n_no_listener = 0;
+  word n_listener = 0;
+
+  from = vlib_frame_vector_args (from_frame);
+  n_left_from = from_frame->n_vectors;
+
+  next_index = node->cached_next_index;
+
+  while (n_left_from > 0)
+    {
+      u32 n_left_to_next;
+
+      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+      while (n_left_from >= 4 && n_left_to_next >= 2)
+	{
+	  u32 bi0, bi1;
+	  vlib_buffer_t *b0, *b1;
+	  u32 next0, next1;
+	  ipfix_message_header_t *ipfix0, *ipfix1;
+	  ipfix_set_header_t *set0, *set1;
+	  u16 set_id0, set_id1;
+	  ipfix_client *client0, *client1;
+
+	  /* Prefetch next iteration.
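+	     Only the IPFIX message header plus the first set header are
+	     prefetched, since that is all the dispatch below reads.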
*/ + { + vlib_buffer_t *p2, *p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, + (sizeof (ipfix_message_header_t) + + sizeof (ipfix_set_header_t)), LOAD); + CLIB_PREFETCH (p3->data, + (sizeof (ipfix_message_header_t) + + sizeof (ipfix_set_header_t)), LOAD); + } + + bi0 = from[0]; + bi1 = from[1]; + to_next[0] = bi0; + to_next[1] = bi1; + from += 2; + to_next += 2; + n_left_to_next -= 2; + n_left_from -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + ipfix0 = vlib_buffer_get_current (b0); + ipfix1 = vlib_buffer_get_current (b1); + + set0 = (ipfix_set_header_t *) (ipfix0 + 1); + set1 = (ipfix_set_header_t *) (ipfix1 + 1); + + set_id0 = (u16) (clib_net_to_host_u32 (set0->set_id_length) >> 16); + set_id1 = (u16) (clib_net_to_host_u32 (set1->set_id_length) >> 16); + + client0 = ipfix_collector_get_client (set_id0); + client1 = ipfix_collector_get_client (set_id1); + + if (PREDICT_TRUE (NULL != client0)) + { + next0 = client0->client_next_node; + n_listener++; + } + else + { + next0 = IPFIX_COLLECTOR_NEXT_DROP; + n_no_listener++; + } + + if (PREDICT_TRUE (NULL != client1)) + { + next1 = client1->client_next_node; + n_listener++; + } + else + { + next1 = IPFIX_COLLECTOR_NEXT_DROP; + n_no_listener++; + } + + vlib_buffer_advance (b0, + (sizeof (ipfix_message_header_t) + + sizeof (ipfix_set_header_t))); + vlib_buffer_advance (b1, + (sizeof (ipfix_message_header_t) + + sizeof (ipfix_set_header_t))); + + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + ipfix_collector_trace_t *tr = vlib_add_trace (vm, node, + b0, sizeof (*tr)); + tr->next_node = (client0 ? client0->client_node : 0xFFFFFFFF); + tr->set_id = set_id0; + } + if (PREDICT_FALSE (b1->flags & VLIB_BUFFER_IS_TRACED)) + { + ipfix_collector_trace_t *tr = vlib_add_trace (vm, node, + b1, sizeof (*tr)); + tr->next_node = (client1 ? client1->client_node : 0xFFFFFFFF); + tr->set_id = set_id1; + } + + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, next0, next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + u32 next0; + ipfix_message_header_t *ipfix0; + ipfix_set_header_t *set0; + u16 set_id0; + ipfix_client *client0; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + ipfix0 = vlib_buffer_get_current (b0); + + set0 = (ipfix_set_header_t *) (ipfix0 + 1); + + set_id0 = (u16) (clib_net_to_host_u32 (set0->set_id_length) >> 16); + + client0 = ipfix_collector_get_client (set_id0); + + if (PREDICT_TRUE (NULL != client0)) + { + next0 = client0->client_next_node; + n_listener++; + } + else + { + next0 = IPFIX_COLLECTOR_NEXT_DROP; + n_no_listener++; + } + + vlib_buffer_advance (b0, + (sizeof (ipfix_message_header_t) + + sizeof (ipfix_set_header_t))); + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + ipfix_collector_trace_t *tr = vlib_add_trace (vm, node, + b0, sizeof (*tr)); + tr->next_node = (client0 ? 
client0->client_node : 0xFFFFFFFF); + tr->set_id = set_id0; + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + vlib_error_count (vm, node->node_index, + IPFIX_COLLECTOR_ERROR_NO_LISTENER, n_no_listener); + vlib_error_count (vm, node->node_index, + IPFIX_COLLECTOR_ERROR_PROCESSED, n_listener); + return from_frame->n_vectors; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (ipfix_collector_node) = { + .function = ipfix_collector_node_fn, + .name = "ipfix-collector", + .vector_size = sizeof (u32), + .format_trace = format_ipfix_collector_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(ipfix_collector_error_strings), + .error_strings = ipfix_collector_error_strings, + + .n_next_nodes = IPFIX_COLLECTOR_N_NEXT, + + /* edit / add dispositions here */ + .next_nodes = { + [IPFIX_COLLECTOR_NEXT_DROP] = "error-drop", + }, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/lib-e2e/e2e_util.h b/src/plugins/ioam/lib-e2e/e2e_util.h new file mode 100644 index 00000000..f8a4ebd4 --- /dev/null +++ b/src/plugins/ioam/lib-e2e/e2e_util.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef PLUGINS_IOAM_PLUGIN_IOAM_LIB_E2E_E2E_UTIL_H_ +#define PLUGINS_IOAM_PLUGIN_IOAM_LIB_E2E_E2E_UTIL_H_ + +#include <ioam/lib-e2e/ioam_seqno_lib.h> + +/* *INDENT-OFF* */ +typedef CLIB_PACKED(struct { + u8 e2e_type; + u8 reserved; + u32 e2e_data; +}) ioam_e2e_packet_t; +/* *INDENT-ON* */ + +#endif /* PLUGINS_IOAM_PLUGIN_IOAM_LIB_E2E_E2E_UTIL_H_ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/lib-e2e/ioam_seqno_lib.c b/src/plugins/ioam/lib-e2e/ioam_seqno_lib.c new file mode 100644 index 00000000..bf78c1e3 --- /dev/null +++ b/src/plugins/ioam/lib-e2e/ioam_seqno_lib.c @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <ioam/lib-e2e/ioam_seqno_lib.h> + +u8 * +show_ioam_seqno_cmd_fn (u8 * s, ioam_seqno_data * seqno_data, u8 enc) +{ + seqno_rx_info *rx; + + s = format (s, "SeqNo Data:\n"); + if (enc) + { + s = format (s, " Current Seq. 
Number : %llu\n", seqno_data->seq_num);
+    }
+  else
+    {
+      rx = &seqno_data->seqno_rx;
+      s = show_ioam_seqno_analyse_data_fn (s, rx);
+    }
+
+  s = format (s, "\n");
+  return s;
+}
+
+u8 *
+show_ioam_seqno_analyse_data_fn (u8 * s, seqno_rx_info * rx)
+{
+  s = format (s, "   Highest Seq. Number : %llu\n", rx->bitmap.highest);
+  s = format (s, "     Packets received : %llu\n", rx->rx_packets);
+  s = format (s, "         Lost packets : %llu\n", rx->lost_packets);
+  s = format (s, "    Reordered packets : %llu\n", rx->reordered_packets);
+  s = format (s, "    Duplicate packets : %llu\n", rx->dup_packets);
+
+  s = format (s, "\n");
+  return s;
+}
+
+void
+ioam_seqno_init_data (ioam_seqno_data * data)
+{
+  data->seq_num = 0;
+  ioam_seqno_init_rx_info (&data->seqno_rx);
+  return;
+}
+
+void
+ioam_seqno_init_rx_info (seqno_rx_info * data)
+{
+  seqno_bitmap *bitmap = &data->bitmap;
+  bitmap->window_size = SEQNO_WINDOW_SIZE;
+  bitmap->array_size = SEQNO_WINDOW_ARRAY_SIZE;
+  bitmap->mask = 32 * SEQNO_WINDOW_ARRAY_SIZE - 1;
+  bitmap->array[0] = 0x00000000;	/* pretend we haven't seen sequence number 0 */
+  bitmap->highest = 0;
+
+  data->dup_packets = 0;
+  data->lost_packets = 0;
+  data->reordered_packets = 0;
+  data->rx_packets = 0;
+  return;
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/ioam/lib-e2e/ioam_seqno_lib.h b/src/plugins/ioam/lib-e2e/ioam_seqno_lib.h
new file mode 100644
index 00000000..6bd38ff2
--- /dev/null
+++ b/src/plugins/ioam/lib-e2e/ioam_seqno_lib.h
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PLUGINS_IOAM_PLUGIN_IOAM_LIB_E2E_IOAM_SEQNO_LIB_H_
+#define PLUGINS_IOAM_PLUGIN_IOAM_LIB_E2E_IOAM_SEQNO_LIB_H_
+
+#include <vppinfra/types.h>
+
+#define SEQ_CHECK_VALUE 0x80000000	/* for seq number wraparound detection */
+
+#define SEQNO_WINDOW_SIZE 2048
+#define SEQNO_WINDOW_ARRAY_SIZE 64
+
+typedef struct seqno_bitmap_
+{
+  u32 window_size;
+  u32 array_size;
+  u32 mask;
+  u32 pad;
+  u64 highest;
+  u64 array[SEQNO_WINDOW_ARRAY_SIZE];	/* Allocated to array_size entries */
+} seqno_bitmap;
+
+typedef struct seqno_rx_info_
+{
+  u64 rx_packets;
+  u64 lost_packets;
+  u64 reordered_packets;
+  u64 dup_packets;
+  seqno_bitmap bitmap;
+} seqno_rx_info;
+
+/* This structure is 64-byte aligned */
+typedef struct ioam_seqno_data_
+{
+  union
+  {
+    u32 seq_num;		/* Useful only for encap node */
+    seqno_rx_info seqno_rx;
+  };
+} ioam_seqno_data;
+
+static inline void
+BIT_SET (u64 * p, u32 n)
+{
+  p[n >> 5] |= (1 << (n & 31));
+}
+
+static inline int
+BIT_TEST (u64 * p, u32 n)
+{
+  return p[n >> 5] & (1 << (n & 31));
+}
+
+static void
+BIT_CLEAR (u64 * p, u64 start, int num_bits, u32 mask)
+{
+  int n, t;
+  int start_index = (start >> 5);
+  int mask_index = (mask >> 5);
+
+  start_index &= mask_index;
+  if (start & 0x1f)
+    {
+      int start_bit = (start & 0x1f);
+
+      n = (1 << start_bit) - 1;
+      t = start_bit + num_bits;
+      if (t < 32)
+	{
+	  n |= ~((1 << t) - 1);
+	  p[start_index] &= n;
+	  return;
+	}
+      p[start_index] &= n;
+      start_index = (start_index + 1) & mask_index;
+      num_bits -= (32 - start_bit);
+    }
+  while (num_bits >= 32)
+    {
+      p[start_index] = 0;
+      start_index = (start_index + 1) & mask_index;
+      num_bits -= 32;
+    }
+  n = ~((1 << num_bits) - 1);
+  p[start_index] &= n;
+}
+
+static inline u8
+seqno_check_wraparound (u32 a, u32 b)
+{
+  if ((a != b) && (a > b) && ((a - b) > SEQ_CHECK_VALUE))
+    {
+      return 1;
+    }
+  return 0;
+}
+
+/*
+ * Function to analyze the PPC (sequence number) value received.
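+ * (The rx bitmap window spans SEQNO_WINDOW_ARRAY_SIZE * 32 = 64 * 32 = 2048
+ * sequence numbers with mask = 2047; e.g. seqno 5000 maps to bit
+ * (5000 & 2047) = 904.)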
+ * - Updates the bitmap with received sequence number + * - counts the received/lost/duplicate/reordered packets + */ +inline static void +ioam_analyze_seqno (seqno_rx_info * seqno_rx, u64 seqno) +{ + int diff; + static int peer_dead_count; + seqno_bitmap *bitmap = &seqno_rx->bitmap; + + seqno_rx->rx_packets++; + + if (seqno > bitmap->highest) + { /* new larger sequence number */ + peer_dead_count = 0; + diff = seqno - bitmap->highest; + if (diff < bitmap->window_size) + { + if (diff > 1) + { /* diff==1 is *such* a common case it's a win to optimize it */ + BIT_CLEAR (bitmap->array, bitmap->highest + 1, diff - 1, + bitmap->mask); + seqno_rx->lost_packets += diff - 1; + } + } + else + { + seqno_rx->lost_packets += diff - 1; + memset (bitmap->array, 0, bitmap->array_size * sizeof (u64)); + } + BIT_SET (bitmap->array, seqno & bitmap->mask); + bitmap->highest = seqno; + return; + } + + /* we've seen a bigger seq number before */ + diff = bitmap->highest - seqno; + if (diff >= bitmap->window_size) + { + if (seqno_check_wraparound (bitmap->highest, seqno)) + { + memset (bitmap->array, 0, bitmap->array_size * sizeof (u64)); + BIT_SET (bitmap->array, seqno & bitmap->mask); + bitmap->highest = seqno; + return; + } + else + { + peer_dead_count++; + if (peer_dead_count > 25) + { + peer_dead_count = 0; + memset (bitmap->array, 0, bitmap->array_size * sizeof (u64)); + BIT_SET (bitmap->array, seqno & bitmap->mask); + bitmap->highest = seqno; + } + //ppc_rx->reordered_packets++; + } + return; + } + + if (BIT_TEST (bitmap->array, seqno & bitmap->mask)) + { + seqno_rx->dup_packets++; + return; /* Already seen */ + } + seqno_rx->reordered_packets++; + seqno_rx->lost_packets--; + BIT_SET (bitmap->array, seqno & bitmap->mask); + return; +} + +u8 *show_ioam_seqno_analyse_data_fn (u8 * s, seqno_rx_info * rx); + +u8 *show_ioam_seqno_cmd_fn (u8 * s, ioam_seqno_data * seqno_data, u8 enc); + +void ioam_seqno_init_data (ioam_seqno_data * data); + +void ioam_seqno_init_rx_info (seqno_rx_info * data); + +#endif /* PLUGINS_IOAM_PLUGIN_IOAM_LIB_E2E_IOAM_SEQNO_LIB_H_ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/lib-pot/math64.h b/src/plugins/ioam/lib-pot/math64.h new file mode 100644 index 00000000..4c608a37 --- /dev/null +++ b/src/plugins/ioam/lib-pot/math64.h @@ -0,0 +1,159 @@ +/* + * math64.h provides the 64 bit unsigned integer add, multiply followed by modulo operation + * The linux/math64.h provides divide and multiply 64 bit integers but: + * 1. multiply: mul_u64_u64_shr - only returns 64 bits of the result and has to be called + * twice to get the complete 128 bits of the result. + * 2. Modulo operation of the result of addition and multiplication of u64 that may result + * in integers > 64 bits is not supported + * Hence this header to combine add/multiply followed by modulo of u64 integrers + * always resulting in u64. + * + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef include_vnet_math64_h +#define include_vnet_math64_h +#include <stdint.h> + +/* + * multiplies and returns result in hi and lo + */ +static inline void mul64by64(u64 a, u64 b, u64 * hi, u64 * lo) +{ + u64 a_lo = (u64) (uint32_t) a; + u64 a_hi = a >> 32; + u64 b_lo = (u64) (u32) b; + u64 b_hi = b >> 32; + + u64 p0 = a_lo * b_lo; + u64 p1 = a_lo * b_hi; + u64 p2 = a_hi * b_lo; + u64 p3 = a_hi * b_hi; + + u32 cy = (u32) (((p0 >> 32) + (u32) p1 + (u32) p2) >> 32); + + *lo = p0 + (p1 << 32) + (p2 << 32); + *hi = p3 + (p1 >> 32) + (p2 >> 32) + cy; + return; +} + +#define TWO64 18446744073709551616.0 + +static inline u64 mod128by64(u64 x, u64 y, u64 m, double di) +{ + u64 q1, q2, q; + u64 p1, p0; + double dq; + + /* calculate quotient first pass 53 bits */ + dq = (TWO64 * (double)x + (double)y) * di; + + if (dq >= TWO64) + q1 = 0xfffffffffffff800L; + else + q1 = dq; + + /* q1 * m to compare the product to the dividend. */ + mul64by64(q1, m, &p1, &p0); + + /* Adjust quotient. is it > actual result: */ + if (x < p1 || (x == p1 && y < p0)) + { + /* q1 > quotient. calculate abs remainder */ + x = p1 - (x + (p0 < y)); + y = p0 - y; + + /* use the remainder as new dividend to adjust quotient */ + q2 = (u64) ((TWO64 * (double)x + (double)y) * di); + mul64by64(q2, m, &p1, &p0); + + q = q1 - q2; + if (x < p1 || (x == p1 && y <= p0)) + { + y = p0 - y; + } + else + { + y = p0 - y; + y += m; + q--; + } + } + else + { + x = x - (p1 + (y < p0)); + y = y - p0; + + q2 = (u64) ((TWO64 * (double)x + (double)y) * di); + mul64by64(q2, m, &p1, &p0); + + q = q1 + q2; + if (x < p1 || (x == p1 && y < p0)) + { + y = y - p0; + y += m; + q--; + } + else + { + y = y - p0; + if (y >= m) + { + y -= m; + q++; + } + } + } + + return y; +} + +/* + * returns a % p + */ +static inline u64 mod64by64(u64 a, u64 p, u64 primeinv) +{ + return (mod128by64(0, a, p, primeinv)); +} + +static inline void add64(u64 a, u64 b, u64 * whi, u64 * wlo) +{ + *wlo = a + b; + if (*wlo < a) + *whi = 1; + +} + +/* + * returns (a + b)%p + */ +static inline u64 add64_mod(u64 a, u64 b, u64 p, double pi) +{ + u64 shi = 0, slo = 0; + + add64(a, b, &shi, &slo); + return (mod128by64(shi, slo, p, pi)); +} + +/* + * returns (ab) % p + */ +static inline u64 mul64_mod(u64 a, u64 b, u64 p, double pi) +{ + u64 phi = 0, plo = 0; + + mul64by64(a, b, &phi, &plo); + return (mod128by64(phi, plo, p, pi)); +} + +#endif diff --git a/src/plugins/ioam/lib-pot/pot.api b/src/plugins/ioam/lib-pot/pot.api new file mode 100644 index 00000000..c377cde0 --- /dev/null +++ b/src/plugins/ioam/lib-pot/pot.api @@ -0,0 +1,105 @@ +/* Hey Emacs use -*- mode: C -*- */ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +/** \brief Proof of Transit(POT): Set POT profile + @param id - id of the profile + @param validator - True/False to indicate if this is verifier + @param secret_key - Verification key + @param secret_share - Share of the 1st polynomial + @param prime - Prime number used for modulo operation + @param max_bits - Max bits to be used for Random number generation + @param lpc - Lagrange basis polynomial + @param polynomial_public - pre-evaluated public polynomial + @param list_name_len - length of the name of this profile list + @param list_name - name of this profile list +*/ +autoreply define pot_profile_add { + u32 client_index; + u32 context; + u8 id; + u8 validator; + u64 secret_key; + u64 secret_share; + u64 prime; + u8 max_bits; + u64 lpc; + u64 polynomial_public; + u8 list_name_len; + u8 list_name[0]; +}; + +/** \brief Proof of Transit(POT): Activate POT profile in the list + @param id - id of the profile + @param list_name_len - length of the name of this profile list + @param list_name - name of this profile list +*/ +autoreply define pot_profile_activate { + u32 client_index; + u32 context; + u8 id; + u8 list_name_len; + u8 list_name[0]; +}; + +/** \brief Delete POT Profile + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param list_name_len - length of the name of the profile list + @param list_name - name of profile list to delete +*/ +autoreply define pot_profile_del { + u32 client_index; + u32 context; + u8 list_name_len; + u8 list_name[0]; +}; + +/** \brief Show POT Profiles + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param id - id of the profile +*/ +define pot_profile_show_config_dump { + u32 client_index; + u32 context; + u8 id; +}; + +/** \brief Show POT profile reply + @param id - id of the profile + @param validator - True/False to indicate if this is verifier + @param secret_key - Verification key + @param secret_share - Share of the 1st polynomial + @param prime - Prime number used for modulo operation + @param max_bits - Max bits to be used for Random number generation + @param lpc - Lagrange basis polynomial + @param polynomial_public - pre-evaluated public polynomial + @param list_name_len - length of the name of this profile list + @param list_name - name of this profile list +*/ +define pot_profile_show_config_details { + u32 context; + i32 retval; + u8 id; + u8 validator; + u64 secret_key; + u64 secret_share; + u64 prime; + u64 bit_mask; + u64 lpc; + u64 polynomial_public; +}; diff --git a/src/plugins/ioam/lib-pot/pot_all_api_h.h b/src/plugins/ioam/lib-pot/pot_all_api_h.h new file mode 100644 index 00000000..63967c45 --- /dev/null +++ b/src/plugins/ioam/lib-pot/pot_all_api_h.h @@ -0,0 +1,16 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* Include the generated file, see BUILT_SOURCES in Makefile.am */ +#include <ioam/lib-pot/pot.api.h> diff --git a/src/plugins/ioam/lib-pot/pot_api.c b/src/plugins/ioam/lib-pot/pot_api.c new file mode 100644 index 00000000..cc1b7b76 --- /dev/null +++ b/src/plugins/ioam/lib-pot/pot_api.c @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + *------------------------------------------------------------------ + * pot_api.c - Proof of Transit related APIs to create + * and maintain profiles + *------------------------------------------------------------------ + */ + +#include <vnet/vnet.h> +#include <vnet/plugin/plugin.h> +#include <ioam/lib-pot/pot_util.h> + +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vlibsocket/api.h> + +/* define message IDs */ +#include <ioam/lib-pot/pot_msg_enum.h> + +/* define message structures */ +#define vl_typedefs +#include <ioam/lib-pot/pot_all_api_h.h> +#undef vl_typedefs + +/* define generated endian-swappers */ +#define vl_endianfun +#include <ioam/lib-pot/pot_all_api_h.h> +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__) +#define vl_printfun +#include <ioam/lib-pot/pot_all_api_h.h> +#undef vl_printfun + +/* Get the API version number */ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <ioam/lib-pot/pot_all_api_h.h> +#undef vl_api_version + +#define REPLY_MSG_ID_BASE sm->msg_id_base +#include <vlibapi/api_helper_macros.h> + +/* List of message types that this plugin understands */ +#define foreach_pot_plugin_api_msg \ +_(POT_PROFILE_ADD, pot_profile_add) \ +_(POT_PROFILE_ACTIVATE, pot_profile_activate) \ +_(POT_PROFILE_DEL, pot_profile_del) \ +_(POT_PROFILE_SHOW_CONFIG_DUMP, pot_profile_show_config_dump) \ + +static void vl_api_pot_profile_add_t_handler +(vl_api_pot_profile_add_t *mp) +{ + pot_main_t * sm = &pot_main; + int rv = 0; + vl_api_pot_profile_add_reply_t * rmp; + u8 id; + pot_profile *profile = NULL; + u8 *name = 0; + + if (mp->list_name_len) + name = format(0, "%s", mp->list_name); + + pot_profile_list_init(name); + id = mp->id; + profile = pot_profile_find(id); + if (profile) { + rv = pot_profile_create(profile, + clib_net_to_host_u64(mp->prime), + clib_net_to_host_u64(mp->polynomial_public), + clib_net_to_host_u64(mp->lpc), + clib_net_to_host_u64(mp->secret_share)); + if (rv != 0) + goto ERROROUT; + if (1 == mp->validator) + (void)pot_set_validator(profile, clib_net_to_host_u64(mp->secret_key)); + (void)pot_profile_set_bit_mask(profile, mp->max_bits); + } else { + rv = -3; + } + ERROROUT: + vec_free(name); + REPLY_MACRO(VL_API_POT_PROFILE_ADD_REPLY); +} + +static void send_pot_profile_details(vl_api_pot_profile_show_config_dump_t *mp, u8 id) +{ + vl_api_pot_profile_show_config_details_t * rmp; + pot_main_t * sm = &pot_main; + pot_profile *profile = pot_profile_find(id); + int rv = 0; + if(profile){ + 
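+    /* Profile found: echo its parameters back, converting each u64
+       field to network byte order. */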
REPLY_MACRO2(VL_API_POT_PROFILE_SHOW_CONFIG_DETAILS, + rmp->id=id; + rmp->validator=profile->validator; + rmp->secret_key=clib_host_to_net_u64(profile->secret_key); + rmp->secret_share=clib_host_to_net_u64(profile->secret_share); + rmp->prime=clib_host_to_net_u64(profile->prime); + rmp->bit_mask=clib_host_to_net_u64(profile->bit_mask); + rmp->lpc=clib_host_to_net_u64(profile->lpc); + rmp->polynomial_public=clib_host_to_net_u64(profile->poly_pre_eval); + ); + } + else{ + REPLY_MACRO2(VL_API_POT_PROFILE_SHOW_CONFIG_DETAILS, + rmp->id=id; + rmp->validator=0; + rmp->secret_key=0; + rmp->secret_share=0; + rmp->prime=0; + rmp->bit_mask=0; + rmp->lpc=0; + rmp->polynomial_public=0; + ); + } +} + +static void vl_api_pot_profile_show_config_dump_t_handler +(vl_api_pot_profile_show_config_dump_t *mp) +{ + u8 id = mp->id; + u8 dump_call_id = ~0; + if(dump_call_id==id){ + for(id=0;id<MAX_POT_PROFILES;id++) + send_pot_profile_details(mp,id); + } + else + send_pot_profile_details(mp,id); +} + +static void vl_api_pot_profile_activate_t_handler +(vl_api_pot_profile_activate_t *mp) +{ + pot_main_t * sm = &pot_main; + int rv = 0; + vl_api_pot_profile_add_reply_t * rmp; + u8 id; + u8 *name = NULL; + + if (mp->list_name_len) + name = format(0, "%s", mp->list_name); + if (!pot_profile_list_is_enabled(name)) { + rv = -1; + } else { + id = mp->id; + rv = pot_profile_set_active(id); + } + + vec_free(name); + REPLY_MACRO(VL_API_POT_PROFILE_ACTIVATE_REPLY); +} + + +static void vl_api_pot_profile_del_t_handler +(vl_api_pot_profile_del_t *mp) +{ + pot_main_t * sm = &pot_main; + int rv = 0; + vl_api_pot_profile_del_reply_t * rmp; + + clear_pot_profiles(); + + REPLY_MACRO(VL_API_POT_PROFILE_DEL_REPLY); +} + +/* Set up the API message handling tables */ +static clib_error_t * +pot_plugin_api_hookup (vlib_main_t *vm) +{ + pot_main_t * sm = &pot_main; +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + sm->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_pot_plugin_api_msg; +#undef _ + + return 0; +} + +#define vl_msg_name_crc_list +#include <ioam/lib-pot/pot_all_api_h.h> +#undef vl_msg_name_crc_list + +static void +setup_message_id_table (pot_main_t * sm, api_main_t * am) +{ +#define _(id,n,crc) \ + vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id + sm->msg_id_base); + foreach_vl_msg_name_crc_pot; +#undef _ +} + +static clib_error_t * pot_init (vlib_main_t * vm) +{ + pot_main_t * sm = &pot_main; + clib_error_t * error = 0; + u8 * name; + + bzero(sm, sizeof(pot_main)); + (void)pot_util_init(); + + sm->vlib_main = vm; + sm->vnet_main = vnet_get_main(); + + name = format (0, "ioam_pot_%08x%c", api_version, 0); + + /* Ask for a correctly-sized block of API message decode slots */ + sm->msg_id_base = vl_msg_api_get_msg_ids + ((char *) name, VL_MSG_FIRST_AVAILABLE); + + error = pot_plugin_api_hookup (vm); + + /* Add our API messages to the global name_crc hash table */ + setup_message_id_table (sm, &api_main); + + vec_free(name); + + return error; +} + +VLIB_INIT_FUNCTION (pot_init); diff --git a/src/plugins/ioam/lib-pot/pot_msg_enum.h b/src/plugins/ioam/lib-pot/pot_msg_enum.h new file mode 100644 index 00000000..a4a88bed --- /dev/null +++ b/src/plugins/ioam/lib-pot/pot_msg_enum.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef included_pot_msg_enum_h +#define included_pot_msg_enum_h + +#include <vppinfra/byte_order.h> + +#define vl_msg_id(n,h) n, +typedef enum { +#include <ioam/lib-pot/pot_all_api_h.h> + /* We'll want to know how many messages IDs we need... */ + VL_MSG_FIRST_AVAILABLE, +} vl_msg_id_t; +#undef vl_msg_id + +#endif /* included_pot_msg_enum_h */ diff --git a/src/plugins/ioam/lib-pot/pot_test.c b/src/plugins/ioam/lib-pot/pot_test.c new file mode 100644 index 00000000..1c6dd02d --- /dev/null +++ b/src/plugins/ioam/lib-pot/pot_test.c @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + *------------------------------------------------------------------ + * pot_test.c - test harness for pot plugin + *------------------------------------------------------------------ + */ + +#include <vat/vat.h> +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vlibsocket/api.h> +#include <vppinfra/error.h> + +#define __plugin_msg_base pot_test_main.msg_id_base +#include <vlibapi/vat_helper_macros.h> + +/* Declare message IDs */ +#include <ioam/lib-pot/pot_msg_enum.h> + +/* define message structures */ +#define vl_typedefs +#include <ioam/lib-pot/pot_all_api_h.h> +#undef vl_typedefs + +/* declare message handlers for each api */ + +#define vl_endianfun /* define message structures */ +#include <ioam/lib-pot/pot_all_api_h.h> +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) +#define vl_printfun +#include <ioam/lib-pot/pot_all_api_h.h> +#undef vl_printfun + +/* Get the API version number. 
*/ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <ioam/lib-pot/pot_all_api_h.h> +#undef vl_api_version + + +typedef struct { + /* API message ID base */ + u16 msg_id_base; + vat_main_t *vat_main; +} pot_test_main_t; + +pot_test_main_t pot_test_main; + +#define foreach_standard_reply_retval_handler \ +_(pot_profile_add_reply) \ +_(pot_profile_activate_reply) \ +_(pot_profile_del_reply) + +#define foreach_custom_reply_retval_handler \ +_(pot_profile_show_config_details, \ + errmsg(" ID:%d\n",mp->id); \ + errmsg(" Validator:%d\n",mp->validator); \ + errmsg(" secret_key:%Lx\n",clib_net_to_host_u64(mp->secret_key)); \ + errmsg(" secret_share:%Lx\n",clib_net_to_host_u64(mp->secret_share)); \ + errmsg(" prime:%Lx\n",clib_net_to_host_u64(mp->prime)); \ + errmsg(" bitmask:%Lx\n",clib_net_to_host_u64(mp->bit_mask)); \ + errmsg(" lpc:%Lx\n",clib_net_to_host_u64(mp->lpc)); \ + errmsg(" public poly:%Lx\n",clib_net_to_host_u64(mp->polynomial_public)); \ + ) + +#define _(n) \ + static void vl_api_##n##_t_handler \ + (vl_api_##n##_t * mp) \ + { \ + vat_main_t * vam = pot_test_main.vat_main; \ + i32 retval = ntohl(mp->retval); \ + if (vam->async_mode) { \ + vam->async_errors += (retval < 0); \ + } else { \ + vam->retval = retval; \ + vam->result_ready = 1; \ + } \ + } +foreach_standard_reply_retval_handler; +#undef _ + +#define _(n,body) \ + static void vl_api_##n##_t_handler \ + (vl_api_##n##_t * mp) \ + { \ + vat_main_t * vam = pot_test_main.vat_main; \ + i32 retval = ntohl(mp->retval); \ + if (vam->async_mode) { \ + vam->async_errors += (retval < 0); \ + } else { \ + vam->retval = retval; \ + vam->result_ready = 1; \ + } \ + do{body;}while(0); \ + } +foreach_custom_reply_retval_handler; +#undef _ + +/* + * Table of message reply handlers, must include boilerplate handlers + * we just generated + */ +#define foreach_vpe_api_reply_msg \ +_(POT_PROFILE_ADD_REPLY, pot_profile_add_reply) \ +_(POT_PROFILE_ACTIVATE_REPLY, pot_profile_activate_reply) \ +_(POT_PROFILE_DEL_REPLY, pot_profile_del_reply) \ +_(POT_PROFILE_SHOW_CONFIG_DETAILS, pot_profile_show_config_details) + +static int api_pot_profile_add (vat_main_t *vam) +{ +#define MAX_BITS 64 + unformat_input_t *input = vam->input; + vl_api_pot_profile_add_t *mp; + u8 *name = NULL; + u64 prime = 0; + u64 secret_share = 0; + u64 secret_key = 0; + u32 bits = MAX_BITS; + u64 lpc = 0, poly2 = 0; + u8 id = 0; + int rv = 0; + int ret; + + while (unformat_check_input(input) != UNFORMAT_END_OF_INPUT) + { + if (unformat(input, "name %s", &name)) + ; + else if(unformat(input, "id %d", &id)) + ; + else if (unformat(input, "validator-key 0x%Lx", &secret_key)) + ; + else if (unformat(input, "prime-number 0x%Lx", &prime)) + ; + else if (unformat(input, "secret-share 0x%Lx", &secret_share)) + ; + else if (unformat(input, "polynomial-public 0x%Lx", &poly2)) + ; + else if (unformat(input, "lpc 0x%Lx", &lpc)) + ; + else if (unformat(input, "bits-in-random %u", &bits)) + { + if (bits > MAX_BITS) + bits = MAX_BITS; + } + else + break; + } + + if (!name) + { + errmsg ("name required\n"); + rv = -99; + goto OUT; + } + + M2(POT_PROFILE_ADD, mp, vec_len(name)); + + mp->list_name_len = vec_len(name); + clib_memcpy(mp->list_name, name, mp->list_name_len); + mp->secret_share = clib_host_to_net_u64(secret_share); + mp->polynomial_public = clib_host_to_net_u64(poly2); + mp->lpc = clib_host_to_net_u64(lpc); + mp->prime = clib_host_to_net_u64(prime); + if (secret_key != 0) + { + mp->secret_key = clib_host_to_net_u64(secret_key); + mp->validator = 1; + } + else + { + 
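+      /* No validator-key supplied: this profile only updates the
+         cumulative; it cannot act as the verifier. */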
mp->validator = 0; + } + mp->id = id; + mp->max_bits = bits; + + S(mp); + W (ret); + return ret; + +OUT: + vec_free(name); + return(rv); +} + +static int api_pot_profile_activate (vat_main_t *vam) +{ +#define MAX_BITS 64 + unformat_input_t *input = vam->input; + vl_api_pot_profile_activate_t *mp; + u8 *name = NULL; + u8 id = 0; + int rv = 0; + int ret; + + while (unformat_check_input(input) != UNFORMAT_END_OF_INPUT) + { + if (unformat(input, "name %s", &name)) + ; + else if(unformat(input, "id %d", &id)) + ; + else + break; + } + + if (!name) + { + errmsg ("name required\n"); + rv = -99; + goto OUT; + } + + M2(POT_PROFILE_ACTIVATE, mp, vec_len(name)); + + mp->list_name_len = vec_len(name); + clib_memcpy(mp->list_name, name, mp->list_name_len); + mp->id = id; + + S(mp); + W (ret); + return ret; + +OUT: + vec_free(name); + return(rv); +} + + +static int api_pot_profile_del (vat_main_t *vam) +{ + vl_api_pot_profile_del_t *mp; + int ret; + + M(POT_PROFILE_DEL, mp); + mp->list_name_len = 0; + S(mp); + W (ret); + return ret; +} + +static int api_pot_profile_show_config_dump (vat_main_t *vam) +{ + unformat_input_t *input = vam->input; + vl_api_pot_profile_show_config_dump_t *mp; + u8 id = 0; + int ret; + + while(unformat_check_input(input) != UNFORMAT_END_OF_INPUT) + { + if(unformat(input,"id %d",&id)); + else + break; + } + M(POT_PROFILE_SHOW_CONFIG_DUMP, mp); + + mp->id = id; + + S(mp); + W (ret); + return ret; +} + +/* + * List of messages that the api test plugin sends, + * and that the data plane plugin processes + */ +#define foreach_vpe_api_msg \ +_(pot_profile_add, "name <name> id [0-1] " \ + "prime-number <0xu64> bits-in-random [0-64] " \ + "secret-share <0xu64> lpc <0xu64> polynomial-public <0xu64> " \ + "[validator-key <0xu64>] [validity <0xu64>]") \ +_(pot_profile_activate, "name <name> id [0-1] ") \ +_(pot_profile_del, "[id <nn>]") \ +_(pot_profile_show_config_dump, "id [0-1]") + +static void +pot_vat_api_hookup (vat_main_t *vam) +{ + pot_test_main_t * sm = &pot_test_main; + /* Hook up handlers for replies from the data plane plug-in */ +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + sm->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_vpe_api_reply_msg; +#undef _ + + /* API messages we can send */ +#define _(n,h) hash_set_mem (vam->function_by_name, #n, api_##n); + foreach_vpe_api_msg; +#undef _ + + /* Help strings */ +#define _(n,h) hash_set_mem (vam->help_by_name, #n, h); + foreach_vpe_api_msg; +#undef _ +} + +clib_error_t * vat_plugin_register (vat_main_t *vam) +{ + pot_test_main_t * sm = &pot_test_main; + u8 * name; + + sm->vat_main = vam; + + name = format (0, "ioam_pot_%08x%c", api_version, 0); + sm->msg_id_base = vl_client_get_first_plugin_msg_id ((char *) name); + + if (sm->msg_id_base != (u16) ~0) + pot_vat_api_hookup (vam); + + vec_free(name); + + return 0; +} diff --git a/src/plugins/ioam/lib-pot/pot_util.c b/src/plugins/ioam/lib-pot/pot_util.c new file mode 100644 index 00000000..a253ad41 --- /dev/null +++ b/src/plugins/ioam/lib-pot/pot_util.c @@ -0,0 +1,445 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <vnet/vnet.h> +#include <stdint.h> +#include <time.h> +#include <string.h> +#include <vppinfra/mem.h> +#include "math64.h" +#include "pot_util.h" + +pot_main_t pot_main; + +static void pot_profile_cleanup(pot_profile *profile); + +static void pot_main_profiles_reset (void) +{ + pot_main_t *sm = &pot_main; + int i = 0; + + for (i = 0; i < MAX_POT_PROFILES; i++) + { + pot_profile_cleanup(&(sm->profile_list[i])); + } + sm->active_profile_id = 0; + if (sm->profile_list_name) + vec_free(sm->profile_list_name); + sm->profile_list_name = NULL; +} + +int pot_util_init (void) +{ + pot_main_profiles_reset(); + + return(0); +} + +static void pot_profile_init(pot_profile * new, u8 id) +{ + if (new) + { + memset(new, 0, sizeof(pot_profile)); + new->id = id; + } +} + +pot_profile *pot_profile_find(u8 id) +{ + pot_main_t *sm = &pot_main; + + if (id < MAX_POT_PROFILES) + { + return (&(sm->profile_list[id])); + } + return (NULL); +} +static int pot_profile_name_equal (u8 *name0, u8 *name1) +{ + int len0, len1; + + len0 = vec_len (name0); + len1 = vec_len (name1); + if (len0 != len1) + return(0); + return (0==strncmp ((char *) name0, (char *)name1, len0)); +} + +int pot_profile_list_is_enabled (u8 *name) +{ + pot_main_t *sm = &pot_main; + return (pot_profile_name_equal(sm->profile_list_name, name)); +} + +void pot_profile_list_init(u8 * profile_list_name) +{ + pot_main_t *sm = &pot_main; + int i = 0; + + /* If it is the same profile list skip reset */ + if (pot_profile_name_equal(sm->profile_list_name, profile_list_name)) + { + return; + } + + pot_main_profiles_reset(); + if (vec_len(profile_list_name)) + sm->profile_list_name = (u8 *)vec_dup(profile_list_name); + else + sm->profile_list_name = 0; + sm->active_profile_id = 0; + + for (i = 0; i < MAX_POT_PROFILES; i++) + { + pot_profile_init(&(sm->profile_list[i]), i); + } +} + +static void pot_profile_cleanup(pot_profile * profile) +{ + u16 id = profile->id; + + memset(profile, 0, sizeof(pot_profile)); + profile->id = id; /* Restore id alone */ +} + +int pot_profile_create(pot_profile * profile, u64 prime, + u64 poly2, u64 lpc, u64 secret_share) +{ + if (profile && !profile->in_use) + { + pot_profile_cleanup(profile); + profile->prime = prime; + profile->primeinv = 1.0 / prime; + profile->lpc = lpc; + profile->poly_pre_eval = poly2; + profile->secret_share = secret_share; + profile->total_pkts_using_this_profile = 0; + profile->valid = 1; + return(0); + } + + return(-1); +} + +int pot_set_validator(pot_profile * profile, u64 key) +{ + if (profile && !profile->in_use) + { + profile->validator = 1; + profile->secret_key = key; + return(0); + } + return(-1); +} + +always_inline u64 pot_update_cumulative_inline(u64 cumulative, u64 random, + u64 secret_share, u64 prime, u64 lpc, u64 pre_split, double prime_inv) +{ + u64 share_random = 0; + u64 cumulative_new = 0; + + /* + * calculate split share for random + */ + share_random = add64_mod(pre_split, random, prime, prime_inv); + + /* + * lpc * (share_secret + share_random) + */ + share_random = add64_mod(share_random, secret_share, prime, prime_inv); + share_random = 
mul64_mod(share_random, lpc, prime, prime_inv); + + cumulative_new = add64_mod(cumulative, share_random, prime, prime_inv); + + return (cumulative_new); +} + +u64 pot_update_cumulative(pot_profile * profile, u64 cumulative, u64 random) +{ + if (profile && profile->valid != 0) + { + return (pot_update_cumulative_inline(cumulative, random, profile->secret_share, + profile->prime, profile->lpc, profile->poly_pre_eval, + profile->primeinv)); + } + return (0); +} + +always_inline u8 pot_validate_inline(u64 secret, u64 prime, double prime_inv, + u64 cumulative, u64 random) +{ + if (cumulative == (random + secret)) + { + return (1); + } + else if (cumulative == add64_mod(random, secret, prime, prime_inv)) + { + return (1); + } + return (0); +} + +/* + * return True if the cumulative matches secret from a profile + */ +u8 pot_validate(pot_profile * profile, u64 cumulative, u64 random) +{ + if (profile && profile->validator) + { + return (pot_validate_inline(profile->secret_key, profile->prime, + profile->primeinv, cumulative, random)); + } + return (0); +} + +/* + * Utility function to get random number per pack + */ +u64 pot_generate_random(pot_profile * profile) +{ + u64 random = 0; + int32_t second_half; + static u32 seed = 0; + + if (PREDICT_FALSE(!seed)) + seed = random_default_seed(); + + /* + * Upper 4 bytes seconds + */ + random = (u64) time(NULL); + + random &= 0xffffffff; + random = random << 32; + /* + * Lower 4 bytes random number + */ + second_half = random_u32(&seed); + + random |= second_half; + + if (PREDICT_TRUE(profile != NULL)) + { + random &= profile->bit_mask; + } + return (random); +} + +int pot_profile_set_bit_mask(pot_profile * profile, u16 bits) +{ + int sizeInBits; + + if (profile && !profile->in_use) + { + sizeInBits = sizeof(profile->bit_mask) * 8; + profile->bit_mask = + (bits >= + sizeInBits ? 
(u64) - 1 : (u64) ((u64) 1 << (u64) bits) - 1); + return(0); + } + return(-1); +} + +clib_error_t *clear_pot_profile_command_fn(vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + + pot_main_profiles_reset(); + + return 0; +} + +void clear_pot_profiles() +{ + clear_pot_profile_command_fn(0, 0, 0); +} + +VLIB_CLI_COMMAND(clear_pot_profile_command) = +{ +.path = "clear pot profile", +.short_help = "clear pot profile [<index>|all]", +.function = clear_pot_profile_command_fn, +}; + +static clib_error_t *set_pot_profile_command_fn(vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + u64 prime; + u64 secret_share; + u64 secret_key; + u8 validator = 0; + u32 profile_id = ~0; + u32 bits; + u64 lpc = 0, poly2 = 0; + pot_profile *profile = NULL; + u8 *profile_list_name = NULL; + + bits = MAX_BITS; + + while (unformat_check_input(input) != UNFORMAT_END_OF_INPUT) + { + if (unformat(input, "name %s", + &profile_list_name)); + else if (unformat(input, "id %d", &profile_id)) + ; + else if (unformat(input, "validate-key 0x%Lx", &secret_key)) + validator = 1; + else if (unformat(input, "prime-number 0x%Lx", &prime)) + ; + else if (unformat(input, "secret_share 0x%Lx", &secret_share)) + ; + else if (unformat(input, "polynomial2 0x%Lx", &poly2)) + ; + else if (unformat(input, "lpc 0x%Lx", &lpc)) + ; + else if (unformat(input, "bits-in-random %d", &bits)) + { + if (bits > MAX_BITS) + bits = MAX_BITS; + } + else + break; + } + if (profile_list_name == 0) + { + return clib_error_return(0, "Name cannot be null"); + } + pot_profile_list_init(profile_list_name); + profile = pot_profile_find(profile_id); + + if (profile) + { + pot_profile_create(profile, prime, poly2, lpc, secret_share); + if (validator) + pot_set_validator(profile, secret_key); + pot_profile_set_bit_mask(profile, bits); + } + vec_free(profile_list_name); + return 0; +} + +VLIB_CLI_COMMAND(set_pot_profile_command) = +{ +.path = "set pot profile", +.short_help = "set pot profile name <string> id [0-1] [validator-key 0xu64] \ + prime-number 0xu64 secret_share 0xu64 lpc 0xu64 \ + polynomial2 0xu64 bits-in-random [0-64] ", +.function = set_pot_profile_command_fn, +}; + +static clib_error_t *set_pot_profile_activate_command_fn(vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + pot_main_t *sm = &pot_main; + u8 *profile_list_name = NULL; + u32 id = 0; + clib_error_t *result = NULL; + + while (unformat_check_input(input) != UNFORMAT_END_OF_INPUT) + { + if (unformat(input, "name %s", + &profile_list_name)); + else if (unformat(input, "id %d", &id)) + ; + else + return clib_error_return(0, "unknown input `%U'", + format_unformat_error, input); + } + if (profile_list_name == 0) + { + return clib_error_return(0, "Name cannot be null"); + } + + if (!pot_profile_list_is_enabled(profile_list_name)) { + result = clib_error_return(0, "%s list is not enabled, profile in use %s", + profile_list_name, sm->profile_list_name); + } else if (0 != pot_profile_set_active((u8)id)) { + result = clib_error_return(0, "Profile %d not defined in %s", + id, sm->profile_list_name); + } + vec_free(profile_list_name); + return result; +} + +VLIB_CLI_COMMAND(set_pot_profile_activate_command) = +{ +.path = "set pot profile-active", +.short_help = "set pot profile-active name <string> id [0-1]", +.function = set_pot_profile_activate_command_fn, +}; + +static clib_error_t *show_pot_profile_command_fn(vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + pot_main_t *sm = &pot_main; + 
pot_profile *p = NULL; + u16 i; + u8 *s = 0; + + if (vec_len(sm->profile_list_name) == 0) + { + s = format(s, "POT Profiles not configured\n"); + vlib_cli_output(vm, "%v", s); + return 0; + } + s = format(s, "Profile list in use : %s\n",sm->profile_list_name); + for (i = 0; i < MAX_POT_PROFILES; i++) + { + p = pot_profile_find(i); + if (p->valid == 0) + continue; + s = format(s, "POT Profile at index: %d\n", i); + s = format(s, " Id : %d\n", p->id); + s = format(s, " Validator : %s (%d)\n", + (p->validator) ? "True" : "False", p->validator); + if (p->validator == 1) + s = format(s, " Secret key : 0x%Lx (%Ld)\n", + p->secret_key, p->secret_key); + s = format(s, " Secret share : 0x%Lx (%Ld)\n", + p->secret_share, p->secret_share); + s = format(s, " Prime number : 0x%Lx (%Ld)\n", + p->prime, p->prime); + s = format(s, "2nd polynomial(eval) : 0x%Lx (%Ld)\n", + p->poly_pre_eval, p->poly_pre_eval); + s = format(s, " LPC : 0x%Lx (%Ld)\n", p->lpc, p->lpc); + + s = format(s, " Bit mask : 0x%Lx (%Ld)\n", + p->bit_mask, p->bit_mask); + } + + p = pot_profile_find(sm->active_profile_id); + + if (p && p->valid && p->in_use) { + s = format(s, "\nProfile index in use: %d\n", sm->active_profile_id); + s = format(s, "Pkts passed : 0x%Lx (%Ld)\n", + p->total_pkts_using_this_profile, + p->total_pkts_using_this_profile); + if (pot_is_decap(p)) + s = format(s, " This is Decap node. \n"); + } else { + s = format(s, "\nProfile index in use: None\n"); + } + vlib_cli_output(vm, "%v", s); + vec_free(s); + + return 0; +} + +VLIB_CLI_COMMAND(show_pot_profile_command) = +{ +.path = "show pot profile", +.short_help = "show pot profile", +.function = show_pot_profile_command_fn, +}; diff --git a/src/plugins/ioam/lib-pot/pot_util.h b/src/plugins/ioam/lib-pot/pot_util.h new file mode 100644 index 00000000..9df31fae --- /dev/null +++ b/src/plugins/ioam/lib-pot/pot_util.h @@ -0,0 +1,195 @@ +/* + * pot_util.h -- Proof Of Transit Utility Header + * + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef include_vnet_pot_util_h +#define include_vnet_pot_util_h + +#include <vnet/ip/ip6_hop_by_hop.h> +#define debug_ioam debug_ioam_fn +/* Dont change this size 256. This is there across multiple components */ +#define PATH_NAME_SIZE 256 + +/* Ring size. this should be same as the one in ODL. Do not change this + without change in ODL. 
*/ +#define MAX_POT_PROFILES 2 + +/** + * Usage: + * + * On any node that participates in Proof of Transit: + * + * Step 1: Initialize this library by calling pot_init() + * Step 2: Setup a proof of transit profile that contains all the parameters needed to compute cumulative: + * Call these functions: + * pot_profile_find + * pot_profile_create + * pot_profile_set_bit_mask - To setup how large we want the numbers used in the computation and random number <= 64 bits + * Step 2a: For validator do this: + * pot_set_validator + * Step 2b: On initial node enable the profile to be used: + * pot_profile_set_active / pot_profile_get_active will return the profile + * Step 3a: At the initial node to generate Random number that will be read by all other nodes: + * pot_generate_random + * Step 3b: At all nodes including initial and verifier call this to compute cumulative: + * pot_update_cumulative + * Step 4: At the verifier: + * pot_validate + * + */ + +typedef struct pot_profile_ +{ + u8 id : 1; + u8 valid : 1; + u8 in_use : 1; + u64 random; + u8 validator; + u64 secret_key; + u64 secret_share; + u64 prime; + u64 lpc; + u64 poly_pre_eval; + u64 bit_mask; + u64 limit; + double primeinv; + u64 total_pkts_using_this_profile; +} pot_profile; + +typedef struct { + /* Name of the default profile list in use*/ + u8 *profile_list_name; + pot_profile profile_list[MAX_POT_PROFILES]; + /* number of profiles in the list */ + u8 active_profile_id : 1; + + /* API message ID base */ + u16 msg_id_base; + + /* convenience */ + vlib_main_t * vlib_main; + vnet_main_t * vnet_main; +} pot_main_t; + +extern pot_main_t pot_main; + +/* + * Initialize proof of transit + */ +int pot_util_init(void); +void pot_profile_list_init(u8 * name); + + +/* + * Find a pot profile by ID + */ +pot_profile *pot_profile_find(u8 id); + +static inline u16 pot_profile_get_id(pot_profile * profile) +{ + if (profile) + { + return (profile->id); + } + return (0); +} + +/* setup and clean up profile */ +int pot_profile_create(pot_profile * profile, u64 prime, + u64 poly2, u64 lpc, u64 secret_share); +/* + * Setup profile as a validator + */ +int pot_set_validator(pot_profile * profile, u64 key); + +/* + * Setup max bits to be used for random number generation + */ +#define MAX_BITS 64 +int pot_profile_set_bit_mask(pot_profile * profile, u16 bits); + +/* + * Given a random and cumulative compute the new cumulative for a given profile + */ +u64 pot_update_cumulative(pot_profile * profile, u64 cumulative, u64 random); + +/* + * return True if the cumulative matches secret from a profile + */ +u8 pot_validate(pot_profile * profile, u64 cumulative, u64 random); + +/* + * Utility function to get random number per pack + */ +u64 pot_generate_random(pot_profile * profile); + + +extern void clear_pot_profiles(); +extern int pot_profile_list_is_enabled(u8 *name); + +static inline u8 pot_is_decap(pot_profile * p) +{ + return (p->validator == 1); +} + +static inline int pot_profile_set_active (u8 id) +{ + pot_main_t *sm = &pot_main; + pot_profile *profile = NULL; + pot_profile *current_active_prof = NULL; + + current_active_prof = pot_profile_find(sm->active_profile_id); + profile = pot_profile_find(id); + if (profile && profile->valid) { + sm->active_profile_id = id; + current_active_prof->in_use = 0; + profile->in_use = 1; + return(0); + } + return(-1); +} +static inline u8 pot_profile_get_active_id (void) +{ + pot_main_t *sm = &pot_main; + return (sm->active_profile_id); +} + +static inline pot_profile * pot_profile_get_active (void) +{ + pot_main_t 
*sm = &pot_main; + pot_profile *profile = NULL; + profile = pot_profile_find(sm->active_profile_id); + if (profile && profile->in_use) + return(profile); + return (NULL); +} + +static inline void pot_profile_reset_usage_stats (pot_profile *pow) +{ + if (pow) { + pow->total_pkts_using_this_profile = 0; + } +} + +static inline void pot_profile_incr_usage_stats (pot_profile *pow) +{ + if (pow) { + pow->total_pkts_using_this_profile++; + } +} + + +#endif diff --git a/src/plugins/ioam/lib-trace/trace.api b/src/plugins/ioam/lib-trace/trace.api new file mode 100644 index 00000000..2f45c6e2 --- /dev/null +++ b/src/plugins/ioam/lib-trace/trace.api @@ -0,0 +1,70 @@ +/* Hey Emacs use -*- mode: C -*- */ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/** \brief iOAM6 Trace - Set the iOAM6 trace profile + @param trace_type - Type of trace requested + @param num_elts - Number of trace elements to be inserted + @param node_id - Trace Node ID + @param trace_tsp- Timestamp resolution + @param app_data - Application specific opaque +*/ +autoreply define trace_profile_add { + u32 client_index; + u32 context; + u8 trace_type; + u8 num_elts; + u8 trace_tsp; + u32 node_id; + u32 app_data; +}; + +/** \brief Delete trace Profile + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request +*/ +autoreply define trace_profile_del { + u32 client_index; + u32 context; +}; + +/** \brief Show trace Profile + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request +*/ +define trace_profile_show_config { + u32 client_index; + u32 context; +}; + +/** \brief Show trace config response + @param context - sender context, to match reply w/ request + @param retval - return value for request + @param trace_type - Type of trace requested + @param num_elts - Number of trace elements to be inserted + @param node_id - Trace Node ID + @param trace_tsp- Timestamp resolution + @param app_data - Application specific opaque +*/ +define trace_profile_show_config_reply { + u32 context; + i32 retval; + u8 trace_type; + u8 num_elts; + u8 trace_tsp; + u32 node_id; + u32 app_data; +}; diff --git a/src/plugins/ioam/lib-trace/trace_all_api_h.h b/src/plugins/ioam/lib-trace/trace_all_api_h.h new file mode 100644 index 00000000..223f9545 --- /dev/null +++ b/src/plugins/ioam/lib-trace/trace_all_api_h.h @@ -0,0 +1,16 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* Include the generated file, see BUILT_SOURCES in Makefile.am */ +#include <ioam/lib-trace/trace.api.h> diff --git a/src/plugins/ioam/lib-trace/trace_api.c b/src/plugins/ioam/lib-trace/trace_api.c new file mode 100644 index 00000000..6889859b --- /dev/null +++ b/src/plugins/ioam/lib-trace/trace_api.c @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + *------------------------------------------------------------------ + * trace_api.c - iOAM Trace related APIs to create + * and maintain profiles + *------------------------------------------------------------------ + */ + +#include <vnet/vnet.h> +#include <vnet/plugin/plugin.h> +#include <ioam/lib-trace/trace_util.h> +#include <ioam/lib-trace/trace_config.h> + +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vlibsocket/api.h> + +/* define message IDs */ +#include <ioam/lib-trace/trace_msg_enum.h> + +/* define message structures */ +#define vl_typedefs +#include <ioam/lib-trace/trace_all_api_h.h> +#undef vl_typedefs + +/* define generated endian-swappers */ +#define vl_endianfun +#include <ioam/lib-trace/trace_all_api_h.h> +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__) +#define vl_printfun +#include <ioam/lib-trace/trace_all_api_h.h> +#undef vl_printfun + +/* Get the API version number */ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <ioam/lib-trace/trace_all_api_h.h> +#undef vl_api_version + +/* + * A handy macro to set up a message reply. 
+ * Assumes that the following variables are available: + * mp - pointer to request message + * rmp - pointer to reply message type + * rv - return value + */ + +#define TRACE_REPLY_MACRO(t) \ +do { \ + unix_shared_memory_queue_t * q = \ + vl_api_client_index_to_input_queue (mp->client_index); \ + if (!q) \ + return; \ + \ + rmp = vl_msg_api_alloc (sizeof (*rmp)); \ + rmp->_vl_msg_id = ntohs((t)+sm->msg_id_base); \ + rmp->context = mp->context; \ + rmp->retval = ntohl(rv); \ + \ + vl_msg_api_send_shmem (q, (u8 *)&rmp); \ +} while(0); + +/* *INDENT-OFF* */ +#define TRACE_REPLY_MACRO2(t, body) \ +do { \ + unix_shared_memory_queue_t * q; \ + rv = vl_msg_api_pd_handler (mp, rv); \ + q = vl_api_client_index_to_input_queue (mp->client_index); \ + if (!q) \ + return; \ + \ + rmp = vl_msg_api_alloc (sizeof (*rmp)); \ + rmp->_vl_msg_id = ntohs((t)+sm->msg_id_base); \ + rmp->context = mp->context; \ + rmp->retval = ntohl(rv); \ + do {body;} while (0); \ + vl_msg_api_send_shmem (q, (u8 *)&rmp); \ +} while(0); +/* *INDENT-ON* */ + +/* List of message types that this plugin understands */ + +#define foreach_trace_plugin_api_msg \ +_(TRACE_PROFILE_ADD, trace_profile_add) \ +_(TRACE_PROFILE_DEL, trace_profile_del) \ +_(TRACE_PROFILE_SHOW_CONFIG, trace_profile_show_config) + +static void vl_api_trace_profile_add_t_handler + (vl_api_trace_profile_add_t * mp) +{ + trace_main_t *sm = &trace_main; + int rv = 0; + vl_api_trace_profile_add_reply_t *rmp; + trace_profile *profile = NULL; + + profile = trace_profile_find (); + if (profile) + { + rv = + trace_profile_create (profile, mp->trace_type, mp->num_elts, + mp->trace_tsp, ntohl (mp->node_id), + ntohl (mp->app_data)); + if (rv != 0) + goto ERROROUT; + } + else + { + rv = -3; + } +ERROROUT: + TRACE_REPLY_MACRO (VL_API_TRACE_PROFILE_ADD_REPLY); +} + + +static void vl_api_trace_profile_del_t_handler + (vl_api_trace_profile_del_t * mp) +{ + trace_main_t *sm = &trace_main; + int rv = 0; + vl_api_trace_profile_del_reply_t *rmp; + + clear_trace_profiles (); + + TRACE_REPLY_MACRO (VL_API_TRACE_PROFILE_DEL_REPLY); +} + +static void vl_api_trace_profile_show_config_t_handler + (vl_api_trace_profile_show_config_t * mp) +{ + trace_main_t *sm = &trace_main; + vl_api_trace_profile_show_config_reply_t *rmp; + int rv = 0; + trace_profile *profile = trace_profile_find (); + if (profile->valid) + { + TRACE_REPLY_MACRO2 (VL_API_TRACE_PROFILE_SHOW_CONFIG_REPLY, + rmp->trace_type = profile->trace_type; + rmp->num_elts = profile->num_elts; + rmp->trace_tsp = profile->trace_tsp; + rmp->node_id = htonl (profile->node_id); + rmp->app_data = htonl (profile->app_data); + ); + } + else + { + TRACE_REPLY_MACRO2 (VL_API_TRACE_PROFILE_SHOW_CONFIG_REPLY, + rmp->trace_type = 0; + rmp->num_elts = 0; rmp->trace_tsp = 0; + rmp->node_id = 0; rmp->app_data = 0; + ); + } +} + +/* Set up the API message handling tables */ +static clib_error_t * +trace_plugin_api_hookup (vlib_main_t * vm) +{ + trace_main_t *sm = &trace_main; +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + sm->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_trace_plugin_api_msg; +#undef _ + + return 0; +} + +#define vl_msg_name_crc_list +#include <ioam/lib-trace/trace_all_api_h.h> +#undef vl_msg_name_crc_list + +static void +setup_message_id_table (trace_main_t * sm, api_main_t * am) +{ +#define _(id,n,crc) \ + vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id + sm->msg_id_base); + 
foreach_vl_msg_name_crc_trace; +#undef _ +} + +static clib_error_t * +trace_init (vlib_main_t * vm) +{ + trace_main_t *sm = &trace_main; + clib_error_t *error = 0; + u8 *name; + + bzero (sm, sizeof (trace_main)); + (void) trace_util_init (); + + sm->vlib_main = vm; + sm->vnet_main = vnet_get_main (); + + name = format (0, "ioam_trace_%08x%c", api_version, 0); + + /* Ask for a correctly-sized block of API message decode slots */ + sm->msg_id_base = vl_msg_api_get_msg_ids + ((char *) name, VL_MSG_FIRST_AVAILABLE); + + error = trace_plugin_api_hookup (vm); + + /* Add our API messages to the global name_crc hash table */ + setup_message_id_table (sm, &api_main); + + vec_free (name); + + return error; +} + +VLIB_INIT_FUNCTION (trace_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/lib-trace/trace_config.h b/src/plugins/ioam/lib-trace/trace_config.h new file mode 100644 index 00000000..d9fa9ff2 --- /dev/null +++ b/src/plugins/ioam/lib-trace/trace_config.h @@ -0,0 +1,41 @@ +/* + * trace_config.h -- iOAM trace configuration utility routines + * + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef include_vnet_trace_config_h +#define include_vnet_trace_config_h + +extern trace_main_t trace_main; + +always_inline trace_profile * +trace_profile_find (void) +{ + trace_main_t *sm = &trace_main; + + return (&(sm->profile)); +} + + + +#endif + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/lib-trace/trace_msg_enum.h b/src/plugins/ioam/lib-trace/trace_msg_enum.h new file mode 100644 index 00000000..78c35665 --- /dev/null +++ b/src/plugins/ioam/lib-trace/trace_msg_enum.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef included_trace_msg_enum_h +#define included_trace_msg_enum_h + +#include <vppinfra/byte_order.h> + +#define vl_msg_id(n,h) n, +typedef enum { +#include <ioam/lib-trace/trace_all_api_h.h> + /* We'll want to know how many messages IDs we need... 
*/ + VL_MSG_FIRST_AVAILABLE, +} vl_msg_id_t; +#undef vl_msg_id + +#endif /* included_trace_msg_enum_h */ diff --git a/src/plugins/ioam/lib-trace/trace_test.c b/src/plugins/ioam/lib-trace/trace_test.c new file mode 100644 index 00000000..1e287dee --- /dev/null +++ b/src/plugins/ioam/lib-trace/trace_test.c @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + *------------------------------------------------------------------ + * trace_test.c - test harness for trace plugin + *------------------------------------------------------------------ + */ + +#include <vat/vat.h> +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vlibsocket/api.h> +#include <vppinfra/error.h> + +#define __plugin_msg_base trace_test_main.msg_id_base +#include <vlibapi/vat_helper_macros.h> + +/* Declare message IDs */ +#include <ioam/lib-trace/trace_msg_enum.h> + +/* define message structures */ +#define vl_typedefs +#include <ioam/lib-trace/trace_all_api_h.h> +#undef vl_typedefs + +/* declare message handlers for each api */ + +#define vl_endianfun /* define message structures */ +#include <ioam/lib-trace/trace_all_api_h.h> +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) +#define vl_printfun +#include <ioam/lib-trace/trace_all_api_h.h> +#undef vl_printfun + +/* Get the API version number. 
*/ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <ioam/lib-trace/trace_all_api_h.h> +#undef vl_api_version + + +typedef struct +{ + /* API message ID base */ + u16 msg_id_base; + vat_main_t *vat_main; +} trace_test_main_t; + +trace_test_main_t trace_test_main; + +#define foreach_standard_reply_retval_handler \ +_(trace_profile_add_reply) \ +_(trace_profile_del_reply) + +#define foreach_custom_reply_handler \ +_(trace_profile_show_config_reply, \ + if(mp->trace_type) \ + { \ + errmsg(" Trace Type : 0x%x (%d)\n",mp->trace_type, mp->trace_type); \ + errmsg(" Trace timestamp precision : %d \n",mp->trace_tsp); \ + errmsg(" Node Id : 0x%x (%d)\n",htonl(mp->node_id), htonl(mp->node_id)); \ + errmsg(" App Data : 0x%x (%d)\n",htonl(mp->app_data), htonl(mp->app_data)); \ + } \ + else errmsg("No valid trace profile configuration found\n");) +#define _(n) \ + static void vl_api_##n##_t_handler \ + (vl_api_##n##_t * mp) \ + { \ + vat_main_t * vam = trace_test_main.vat_main; \ + i32 retval = ntohl(mp->retval); \ + if (vam->async_mode) { \ + vam->async_errors += (retval < 0); \ + } else { \ + vam->retval = retval; \ + vam->result_ready = 1; \ + } \ + } +foreach_standard_reply_retval_handler; +#undef _ + +#define _(n,body) \ + static void vl_api_##n##_t_handler \ + (vl_api_##n##_t * mp) \ + { \ + vat_main_t * vam = trace_test_main.vat_main; \ + i32 retval = ntohl(mp->retval); \ + if (vam->async_mode) { \ + vam->async_errors += (retval < 0); \ + } else { \ + vam->retval = retval; \ + vam->result_ready = 1; \ + } \ + if(retval>=0)do{body;} while(0); \ + else errmsg("Error, retval: %d",retval); \ + } +foreach_custom_reply_handler; +#undef _ +/* + * Table of message reply handlers, must include boilerplate handlers + * we just generated + */ +#define foreach_vpe_api_reply_msg \ +_(TRACE_PROFILE_ADD_REPLY, trace_profile_add_reply) \ +_(TRACE_PROFILE_DEL_REPLY, trace_profile_del_reply) \ +_(TRACE_PROFILE_SHOW_CONFIG_REPLY, trace_profile_show_config_reply) + +static int +api_trace_profile_add (vat_main_t * vam) +{ + unformat_input_t *input = vam->input; + vl_api_trace_profile_add_t *mp; + u8 trace_type = 0; + u8 num_elts = 0; + u32 node_id = 0; + u32 app_data = 0; + u8 trace_tsp = 0; + int ret; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "trace-type 0x%x", &trace_type)) + ; + else if (unformat (input, "trace-elts %d", &num_elts)) + ; + else if (unformat (input, "trace-tsp %d", &trace_tsp)) + ; + else if (unformat (input, "node-id 0x%x", &node_id)) + ; + else if (unformat (input, "app-data 0x%x", &app_data)) + ; + + else + break; + } + + + M (TRACE_PROFILE_ADD, mp); + + mp->trace_type = trace_type; + mp->trace_tsp = trace_tsp; + mp->node_id = htonl (node_id); + mp->app_data = htonl (app_data); + mp->num_elts = num_elts; + + S (mp); + W (ret); + return ret; +} + + + +static int +api_trace_profile_del (vat_main_t * vam) +{ + vl_api_trace_profile_del_t *mp; + int ret; + + M (TRACE_PROFILE_DEL, mp); + S (mp); + W (ret); + return ret; +} + +static int +api_trace_profile_show_config (vat_main_t * vam) +{ + vl_api_trace_profile_show_config_t *mp; + int ret; + + M (TRACE_PROFILE_SHOW_CONFIG, mp); + S (mp); + W (ret); + return ret; +} + +/* + * List of messages that the api test plugin sends, + * and that the data plane plugin processes + */ +#define foreach_vpe_api_msg \ +_(trace_profile_add, ""\ + "trace-type <0x1f|0x3|0x9|0x11|0x19> trace-elts <nn> trace-tsp <0|1|2|3> node-id <node id in hex> app-data <app_data in hex>") \ +_(trace_profile_del, "[id 
<nn>]") \ +_(trace_profile_show_config, "[id <nn>]") + + +static void +ioam_trace_vat_api_hookup (vat_main_t * vam) +{ + trace_test_main_t *sm = &trace_test_main; + /* Hook up handlers for replies from the data plane plug-in */ +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + sm->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_vpe_api_reply_msg; +#undef _ + + /* API messages we can send */ +#define _(n,h) hash_set_mem (vam->function_by_name, #n, api_##n); + foreach_vpe_api_msg; +#undef _ + + /* Help strings */ +#define _(n,h) hash_set_mem (vam->help_by_name, #n, h); + foreach_vpe_api_msg; +#undef _ +} + +clib_error_t * +vat_plugin_register (vat_main_t * vam) +{ + trace_test_main_t *sm = &trace_test_main; + u8 *name; + + sm->vat_main = vam; + + name = format (0, "ioam_trace_%08x%c", api_version, 0); + sm->msg_id_base = vl_client_get_first_plugin_msg_id ((char *) name); + + if (sm->msg_id_base != (u16) ~ 0) + ioam_trace_vat_api_hookup (vam); + + vec_free (name); + + return 0; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/lib-trace/trace_util.c b/src/plugins/ioam/lib-trace/trace_util.c new file mode 100644 index 00000000..b316a236 --- /dev/null +++ b/src/plugins/ioam/lib-trace/trace_util.c @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include <vnet/vnet.h> +#include <stdint.h> +#include <time.h> +#include <string.h> +#include <vppinfra/mem.h> +#include "trace_util.h" +#include "trace_config.h" + +trace_main_t trace_main; + +static int +trace_profile_cleanup (trace_profile * profile) +{ + + memset (profile, 0, sizeof (trace_profile)); + profile->trace_tsp = TSP_MICROSECONDS; /* Micro seconds */ + ip6_trace_profile_cleanup (); /* lib-trace_TODO: Remove this once IOAM-IPv6 transport is a plugin */ + return 0; + +} + +static int +trace_main_profiles_reset (void) +{ + int rv; + + trace_main_t *sm = &trace_main; + rv = trace_profile_cleanup (&(sm->profile)); + return (rv); +} + +int +trace_util_init (void) +{ + int rv; + + rv = trace_main_profiles_reset (); + return (rv); +} + + +int +trace_profile_create (trace_profile * profile, u8 trace_type, u8 num_elts, + u32 trace_tsp, u32 node_id, u32 app_data) +{ + + if (!trace_type || !num_elts || !(node_id)) + { + return (-1); + } + if (profile && !profile->valid) + { + //rv = trace_profile_cleanup (profile); + profile->trace_type = trace_type; + profile->num_elts = num_elts; + profile->trace_tsp = trace_tsp; + profile->node_id = node_id; + profile->app_data = app_data; + profile->valid = 1; + + /* lib-trace_TODO: Remove this once IOAM-IPv6 transport is a plugin */ + ip6_trace_profile_setup (); + return (0); + } + + return (-1); +} + + + +clib_error_t * +clear_trace_profile_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + + trace_main_profiles_reset (); + return 0; +} + +void +clear_trace_profiles (void) +{ + clear_trace_profile_command_fn (0, 0, 0); +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND(clear_trace_profile_command) = +{ +.path = "clear ioam-trace profile", +.short_help = "clear ioam-trace profile [<index>|all]", +.function = clear_trace_profile_command_fn, +}; +/* *INDENT-ON* */ + +static clib_error_t * +set_trace_profile_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + u8 trace_type = 0; + u8 num_elts = 0; + u32 node_id = 0; + u32 app_data = 0; + u32 trace_tsp = 0; + trace_profile *profile = NULL; + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "trace-type 0x%x", &trace_type)); + else if (unformat (input, "trace-elts %d", &num_elts)); + else if (unformat (input, "trace-tsp %d", &trace_tsp)); + else if (unformat (input, "node-id 0x%x", &node_id)); + else if (unformat (input, "app-data 0x%x", &app_data)); + else + break; + } + profile = trace_profile_find (); + if (profile) + { + trace_profile_create (profile, trace_type, num_elts, trace_tsp, + node_id, app_data); + } + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (set_trace_profile_command, static) = +{ +.path = "set ioam-trace profile", +.short_help = "set ioam-trace \ + trace-type <0x1f|0x3|0x9|0x11|0x19> trace-elts <nn> trace-tsp <0|1|2|3> \ + node-id <node id in hex> app-data <app_data in hex>", +.function = set_trace_profile_command_fn, +}; +/* *INDENT-ON* */ + +static clib_error_t * +show_trace_profile_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + trace_profile *p = NULL; + u8 *s = 0; + p = trace_profile_find (); + if (!(p && p->valid)) + { + s = format (s, "\nTrace configuration not valid\n"); + vlib_cli_output (vm, "%v", s); + vec_free (s); + return 0; + } + s = format (s, " HOP BY HOP OPTIONS - TRACE CONFIG - \n"); + s = format (s, " Trace Type : 0x%x (%d)\n", + p->trace_type, p->trace_type); + s = + format (s, " Trace timestamp 
precision : %d (%s)\n", + p->trace_tsp, + (p->trace_tsp == + TSP_SECONDS) ? "Seconds" : ((p->trace_tsp == + TSP_MILLISECONDS) ? + "Milliseconds" + : (((p->trace_tsp == + TSP_MICROSECONDS) ? + "Microseconds" : + "Nanoseconds")))); + s = format (s, " Num of trace nodes : %d\n", p->num_elts); + s = + format (s, " Node-id : 0x%x (%d)\n", + p->node_id, p->node_id); + s = + format (s, " App Data : 0x%x (%d)\n", + p->app_data, p->app_data); + vlib_cli_output (vm, "%v", s); + vec_free (s); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_trace_profile_command, static) = +{ +.path = "show ioam-trace profile", +.short_help = "show ioam-trace profile", +.function = show_trace_profile_command_fn, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/lib-trace/trace_util.h b/src/plugins/ioam/lib-trace/trace_util.h new file mode 100644 index 00000000..61f18d91 --- /dev/null +++ b/src/plugins/ioam/lib-trace/trace_util.h @@ -0,0 +1,256 @@ +/* + * trace_util.h -- Trace Profile Utility header + * + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef include_vnet_trace_util_h +#define include_vnet_trace_util_h + +#define debug_ioam debug_ioam_fn + + +/** + * Usage: + * + * On any node that participates in iOAM Trace. + * + * Step 1: Initialize this library by calling trace_init() + * Step 2: Setup a trace profile that contains all the parameters needed to compute cumulative: + * Call these functions: + * trace_profile_find + * trace_profile_create + * Step 2a: On initial node enable the profile to be used: + * trace_profile_set_active / trace_profile_get_active will return the profile + * Step 4: TBD + * trace_validate + * + */ + +typedef struct trace_profile_ +{ + u8 valid:1; + u8 trace_type; + u8 num_elts; + /* Configured node-id */ + u32 node_id; + u32 app_data; + u32 trace_tsp; +} trace_profile; + +typedef struct +{ + /* Name of the default profile list in use */ + trace_profile profile; + + /* API message ID base */ + u16 msg_id_base; + + /* convenience */ + vlib_main_t *vlib_main; + vnet_main_t *vnet_main; +} trace_main_t; + + +/* + * Initialize Trace profile + */ +int trace_util_init (void); + + +/* setup and clean up profile */ +int trace_profile_create (trace_profile * profile, u8 trace_type, u8 num_elts, + u32 trace_tsp, u32 node_id, u32 app_data); + +void clear_trace_profiles (void); + +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct +{ + u8 ioam_trace_type; + u8 data_list_elts_left; + u32 elts[0]; /* Variable type. 
So keep it generic */ +}) ioam_trace_hdr_t; +/* *INDENT-ON* */ + + + +#define BIT_TTL_NODEID (1<<0) +#define BIT_ING_INTERFACE (1<<1) +#define BIT_EGR_INTERFACE (1<<2) +#define BIT_TIMESTAMP (1<<3) +#define BIT_APPDATA (1<<4) +#define BIT_LOOPBACK (1<<5) +#define BIT_LOOPBACK_REPLY (1<<6) +#define TRACE_TYPE_MASK 0x7F /* Mask of all above bits */ + +#define TRACE_TYPE_IF_TS_APP_LOOP 0x3F + +/* + 0x00011111 iOAM-trace-type is 0x00011111 then the format of node + data is: + + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Hop_Lim | node_id | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | ingress_if_id | egress_if_id | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + timestamp + + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | app_data | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +*/ +#define TRACE_TYPE_IF_TS_APP 0x1f +typedef struct +{ + u32 ttl_node_id; + u16 ingress_if; + u16 egress_if; + u32 timestamp; + u32 app_data; +} ioam_trace_if_ts_app_t; + +/* + 0x00000111 iOAM-trace-type is 0x00000111 then the format is: + + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Hop_Lim | node_id | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | ingress_if_id | egress_if_id | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +*/ + +#define TRACE_TYPE_IF 0x03 +typedef struct +{ + u32 ttl_node_id; + u16 ingress_if; + u16 egress_if; +} ioam_trace_if_t; + +/* + 0x00001001 iOAM-trace-type is 0x00001001 then the format is: + + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Hop_Lim | node_id | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + timestamp + + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +*/ + +#define TRACE_TYPE_TS 0x09 +typedef struct +{ + u32 ttl_node_id; + u32 timestamp; +} ioam_trace_ts_t; + +/* + 0x00010001 iOAM-trace-type is 0x00010001 then the format is: + + + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Hop_Lim | node_id | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | app_data | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +*/ + + +#define TRACE_TYPE_APP 0x11 +typedef struct +{ + u32 ttl_node_id; + u32 app_data; +} ioam_trace_app_t; + +/* + + 0x00011001 iOAM-trace-type is 0x00011001 then the format is: + + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Hop_Lim | node_id | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + timestamp + + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | app_data | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +*/ + +#define TRACE_TYPE_TS_APP 0x19 +typedef struct +{ + u32 ttl_node_id; + u32 timestamp; + u32 app_data; +} ioam_trace_ts_app_t; + +static inline u8 +fetch_trace_data_size (u16 trace_type) +{ + u8 trace_data_size = 0; + + if ((trace_type & TRACE_TYPE_IF_TS_APP) == TRACE_TYPE_IF_TS_APP) + trace_data_size = sizeof (ioam_trace_if_ts_app_t); + else if ((trace_type & TRACE_TYPE_IF) == TRACE_TYPE_IF) + trace_data_size = 
sizeof (ioam_trace_if_t); + else if ((trace_type & TRACE_TYPE_TS) == TRACE_TYPE_TS) + trace_data_size = sizeof (ioam_trace_ts_t); + else if ((trace_type & TRACE_TYPE_APP) == TRACE_TYPE_APP) + trace_data_size = sizeof (ioam_trace_app_t); + else if ((trace_type & TRACE_TYPE_TS_APP) == TRACE_TYPE_TS_APP) + trace_data_size = sizeof (ioam_trace_ts_app_t); + + return trace_data_size; +} + +always_inline void +ioam_trace_set_bit (ioam_trace_hdr_t * trace_hdr, u8 trace_bit) +{ + trace_hdr->ioam_trace_type |= trace_bit; +} + +always_inline void +ioam_trace_reset_bit (ioam_trace_hdr_t * trace_hdr, u8 trace_bit) +{ + trace_hdr->ioam_trace_type &= (~trace_bit); +} + +int ioam_trace_get_sizeof_handler (u32 * result); +int ip6_trace_profile_setup (void); +int ip6_trace_profile_cleanup (void); + +#define TSP_SECONDS 0 +#define TSP_MILLISECONDS 1 +#define TSP_MICROSECONDS 2 +#define TSP_NANOSECONDS 3 + +#endif + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/lib-vxlan-gpe/ioam_decap.c b/src/plugins/ioam/lib-vxlan-gpe/ioam_decap.c new file mode 100644 index 00000000..87e57d36 --- /dev/null +++ b/src/plugins/ioam/lib-vxlan-gpe/ioam_decap.c @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include <vppinfra/error.h> +#include <vppinfra/hash.h> +#include <vnet/vnet.h> +#include <vnet/ip/ip.h> +#include <vnet/ethernet/ethernet.h> +#include <vnet/vxlan-gpe/vxlan_gpe.h> +#include <vnet/vxlan-gpe/vxlan_gpe.h> +#include <ioam/lib-vxlan-gpe/vxlan_gpe_ioam_packet.h> +#include <ioam/lib-vxlan-gpe/vxlan_gpe_ioam.h> +#include <ioam/lib-vxlan-gpe/vxlan_gpe_ioam_util.h> + +/* Statistics (not really errors) */ +#define foreach_vxlan_gpe_decap_ioam_v4_error \ +_(DECAPSULATED, "good packets decapsulated") + +static char *vxlan_gpe_decap_ioam_v4_error_strings[] = { +#define _(sym,string) string, + foreach_vxlan_gpe_decap_ioam_v4_error +#undef _ +}; + +typedef enum +{ +#define _(sym,str) VXLAN_GPE_DECAP_IOAM_V4_ERROR_##sym, + foreach_vxlan_gpe_decap_ioam_v4_error +#undef _ + VXLAN_GPE_DECAP_IOAM_V4_N_ERROR, +} vxlan_gpe_decap_ioam_v4_error_t; + + +always_inline void +vxlan_gpe_decap_ioam_v4_two_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vxlan_gpe_main_t * ngm, + vlib_buffer_t * b0, vlib_buffer_t * b1, + u32 * next0, u32 * next1) +{ + vxlan_gpe_ioam_main_t *hm = &vxlan_gpe_ioam_main; + + *next0 = *next1 = hm->decap_v4_next_override; + vxlan_gpe_encap_decap_ioam_v4_one_inline (vm, node, b0, next0, + VXLAN_GPE_DECAP_IOAM_V4_NEXT_DROP, + 0 /* use_adj */ ); + vxlan_gpe_encap_decap_ioam_v4_one_inline (vm, node, b1, next1, + VXLAN_GPE_DECAP_IOAM_V4_NEXT_DROP, + 0 /* use_adj */ ); +} + + + +static uword +vxlan_gpe_decap_ioam (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame, u8 is_ipv6) +{ + u32 n_left_from, next_index, *from, *to_next; + vxlan_gpe_main_t *ngm = &vxlan_gpe_main; + vxlan_gpe_ioam_main_t *hm = &vxlan_gpe_ioam_main; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next >= 2) + { + u32 bi0, bi1; + vlib_buffer_t *b0, *b1; + u32 next0, next1; + + next0 = next1 = hm->decap_v4_next_override; + + /* Prefetch next iteration. 
*/ + { + vlib_buffer_t *p2, *p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (p3->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD); + } + + bi0 = from[0]; + bi1 = from[1]; + to_next[0] = bi0; + to_next[1] = bi1; + from += 2; + to_next += 2; + n_left_to_next -= 2; + n_left_from -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + + vlib_buffer_advance (b0, + -(word) (sizeof (udp_header_t) + + sizeof (ip4_header_t) + + sizeof (vxlan_gpe_header_t))); + vlib_buffer_advance (b1, + -(word) (sizeof (udp_header_t) + + sizeof (ip4_header_t) + + sizeof (vxlan_gpe_header_t))); + + vxlan_gpe_decap_ioam_v4_two_inline (vm, node, ngm, b0, b1, + &next0, &next1); + + + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, to_next, + n_left_to_next, bi0, bi1, next0, + next1); + + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + vxlan_gpe_ioam_v4_trace_t *tr = vlib_add_trace (vm, node, b0, + sizeof (*tr)); + } + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + u32 next0 = hm->decap_v4_next_override; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + + vlib_buffer_advance (b0, + -(word) (sizeof (udp_header_t) + + sizeof (ip4_header_t) + + sizeof (vxlan_gpe_header_t))); + + next0 = hm->decap_v4_next_override; + vxlan_gpe_encap_decap_ioam_v4_one_inline (vm, node, b0, + &next0, + VXLAN_GPE_DECAP_IOAM_V4_NEXT_DROP, + 0 /* use_adj */ ); + + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + vxlan_gpe_ioam_v4_trace_t *tr = vlib_add_trace (vm, node, b0, + sizeof (*tr)); + } + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return from_frame->n_vectors; +} + + +static uword +vxlan_gpe_decap_ioam_v4 (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return vxlan_gpe_decap_ioam (vm, node, from_frame, 0); +} + + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (vxlan_gpe_decap_ioam_v4_node) = { + .function = vxlan_gpe_decap_ioam_v4, + .name = "vxlan-gpe-decap-ioam-v4", + .vector_size = sizeof (u32), + .format_trace = format_vxlan_gpe_ioam_v4_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(vxlan_gpe_decap_ioam_v4_error_strings), + .error_strings = vxlan_gpe_decap_ioam_v4_error_strings, + + .n_next_nodes = VXLAN_GPE_DECAP_IOAM_V4_N_NEXT, + + .next_nodes = { + [VXLAN_GPE_DECAP_IOAM_V4_NEXT_POP] = "vxlan-gpe-pop-ioam-v4", + [VXLAN_GPE_DECAP_IOAM_V4_NEXT_DROP] = "error-drop", + }, +}; +/* *INDENT-ON* */ + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/lib-vxlan-gpe/ioam_encap.c b/src/plugins/ioam/lib-vxlan-gpe/ioam_encap.c new file mode 100644 index 00000000..1d156544 --- /dev/null +++ b/src/plugins/ioam/lib-vxlan-gpe/ioam_encap.c @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <vppinfra/error.h> +#include <vppinfra/hash.h> +#include <vnet/vnet.h> +#include <vnet/ip/ip.h> +#include <vnet/ethernet/ethernet.h> +#include <vnet/vxlan-gpe/vxlan_gpe.h> +#include <ioam/lib-vxlan-gpe/vxlan_gpe_ioam_packet.h> +#include <ioam/lib-vxlan-gpe/vxlan_gpe_ioam.h> +#include <ioam/lib-vxlan-gpe/vxlan_gpe_ioam_util.h> + +/* Statistics (not really errors) */ +#define foreach_vxlan_gpe_encap_ioam_v4_error \ +_(ENCAPSULATED, "good packets encapsulated") + +static char *vxlan_gpe_encap_ioam_v4_error_strings[] = { +#define _(sym,string) string, + foreach_vxlan_gpe_encap_ioam_v4_error +#undef _ +}; + +typedef enum +{ +#define _(sym,str) VXLAN_GPE_ENCAP_IOAM_V4_ERROR_##sym, + foreach_vxlan_gpe_encap_ioam_v4_error +#undef _ + VXLAN_GPE_ENCAP_IOAM_V4_N_ERROR, +} vxlan_gpe_encap_ioam_v4_error_t; + +typedef enum +{ + VXLAN_GPE_ENCAP_IOAM_V4_NEXT_IP4_LOOKUP, + VXLAN_GPE_ENCAP_IOAM_V4_NEXT_DROP, + VXLAN_GPE_ENCAP_IOAM_V4_N_NEXT +} vxlan_gpe_encap_ioam_v4_next_t; + + +always_inline void +vxlan_gpe_encap_ioam_v4_two_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vxlan_gpe_main_t * ngm, + vlib_buffer_t * b0, vlib_buffer_t * b1, + u32 * next0, u32 * next1) +{ + *next0 = *next1 = VXLAN_GPE_ENCAP_IOAM_V4_NEXT_IP4_LOOKUP; + vxlan_gpe_encap_decap_ioam_v4_one_inline (vm, node, b0, next0, + VXLAN_GPE_ENCAP_IOAM_V4_NEXT_DROP, + 0 /* use_adj */ ); + vxlan_gpe_encap_decap_ioam_v4_one_inline (vm, node, b1, next1, + VXLAN_GPE_ENCAP_IOAM_V4_NEXT_DROP, + 0 /* use_adj */ ); +} + + +static uword +vxlan_gpe_encap_ioam_v4 (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + u32 n_left_from, next_index, *from, *to_next; + vxlan_gpe_main_t *ngm = &vxlan_gpe_main; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next >= 2) + { + u32 bi0, bi1; + vlib_buffer_t *b0, *b1; + u32 next0, next1; + + next0 = next1 = VXLAN_GPE_ENCAP_IOAM_V4_NEXT_IP4_LOOKUP; + + /* Prefetch next iteration. 
*/ + { + vlib_buffer_t *p2, *p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (p3->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD); + } + + bi0 = from[0]; + bi1 = from[1]; + to_next[0] = bi0; + to_next[1] = bi1; + from += 2; + to_next += 2; + n_left_to_next -= 2; + n_left_from -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + vxlan_gpe_encap_ioam_v4_two_inline (vm, node, ngm, b0, b1, + &next0, &next1); + + + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, to_next, + n_left_to_next, bi0, bi1, next0, + next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + u32 next0 = VXLAN_GPE_ENCAP_IOAM_V4_NEXT_IP4_LOOKUP; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + vxlan_gpe_encap_decap_ioam_v4_one_inline (vm, node, b0, + &next0, + VXLAN_GPE_ENCAP_IOAM_V4_NEXT_DROP, + 0 /* use_adj */ ); + + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + vxlan_gpe_ioam_v4_trace_t *tr = vlib_add_trace (vm, node, b0, + sizeof (*tr)); + } + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return from_frame->n_vectors; +} + + + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (vxlan_gpe_encap_ioam_v4_node) = { + .function = vxlan_gpe_encap_ioam_v4, + .name = "vxlan-gpe-encap-ioam-v4", + .vector_size = sizeof (u32), + .format_trace = format_vxlan_gpe_ioam_v4_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(vxlan_gpe_encap_ioam_v4_error_strings), + .error_strings = vxlan_gpe_encap_ioam_v4_error_strings, + + .n_next_nodes = VXLAN_GPE_ENCAP_IOAM_V4_N_NEXT, + + .next_nodes = { + [VXLAN_GPE_ENCAP_IOAM_V4_NEXT_IP4_LOOKUP] = "ip4-lookup", + [VXLAN_GPE_ENCAP_IOAM_V4_NEXT_DROP] = "error-drop", + }, +}; +/* *INDENT-ON* */ + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/lib-vxlan-gpe/ioam_pop.c b/src/plugins/ioam/lib-vxlan-gpe/ioam_pop.c new file mode 100644 index 00000000..7a4580d8 --- /dev/null +++ b/src/plugins/ioam/lib-vxlan-gpe/ioam_pop.c @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include <vppinfra/error.h> +#include <vppinfra/hash.h> +#include <vnet/vnet.h> +#include <vnet/ip/ip.h> +#include <vnet/ethernet/ethernet.h> +#include <vnet/vxlan-gpe/vxlan_gpe.h> +#include <ioam/lib-vxlan-gpe/vxlan_gpe_ioam.h> + +/* Statistics (not really errors) */ +#define foreach_vxlan_gpe_pop_ioam_v4_error \ +_(POPPED, "good packets popped") + +static char *vxlan_gpe_pop_ioam_v4_error_strings[] = { +#define _(sym,string) string, + foreach_vxlan_gpe_pop_ioam_v4_error +#undef _ +}; + +typedef enum +{ +#define _(sym,str) VXLAN_GPE_POP_IOAM_V4_ERROR_##sym, + foreach_vxlan_gpe_pop_ioam_v4_error +#undef _ + VXLAN_GPE_POP_IOAM_V4_N_ERROR, +} vxlan_gpe_pop_ioam_v4_error_t; + +typedef struct +{ + ioam_trace_t fmt_trace; +} vxlan_gpe_pop_ioam_v4_trace_t; + + +u8 * +format_vxlan_gpe_pop_ioam_v4_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + vxlan_gpe_pop_ioam_v4_trace_t *t1 + = va_arg (*args, vxlan_gpe_pop_ioam_v4_trace_t *); + ioam_trace_t *t = &(t1->fmt_trace); + vxlan_gpe_ioam_option_t *fmt_trace0; + vxlan_gpe_ioam_option_t *opt0, *limit0; + vxlan_gpe_ioam_main_t *hm = &vxlan_gpe_ioam_main; + + u8 type0; + + fmt_trace0 = (vxlan_gpe_ioam_option_t *) t->option_data; + + s = format (s, "VXLAN_GPE_IOAM_POP: next_index %d len %d traced %d", + t->next_index, fmt_trace0->length, t->trace_len); + + opt0 = (vxlan_gpe_ioam_option_t *) (fmt_trace0 + 1); + limit0 = (vxlan_gpe_ioam_option_t *) ((u8 *) fmt_trace0) + t->trace_len; + + while (opt0 < limit0) + { + type0 = opt0->type; + switch (type0) + { + case 0: /* Pad, just stop */ + opt0 = (vxlan_gpe_ioam_option_t *) ((u8 *) opt0) + 1; + break; + + default: + if (hm->trace[type0]) + { + s = (*hm->trace[type0]) (s, opt0); + } + else + { + s = + format (s, "\n unrecognized option %d length %d", type0, + opt0->length); + } + opt0 = + (vxlan_gpe_ioam_option_t *) (((u8 *) opt0) + opt0->length + + sizeof (vxlan_gpe_ioam_option_t)); + break; + } + } + + return s; +} + +always_inline void +vxlan_gpe_ioam_pop_v4 (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_buffer_t * b0) +{ + ip4_header_t *ip0; + udp_header_t *udp_hdr0; + vxlan_gpe_header_t *gpe_hdr0; + vxlan_gpe_ioam_hdr_t *gpe_ioam0; + + ip0 = vlib_buffer_get_current (b0); + + udp_hdr0 = (udp_header_t *) (ip0 + 1); + gpe_hdr0 = (vxlan_gpe_header_t *) (udp_hdr0 + 1); + gpe_ioam0 = (vxlan_gpe_ioam_hdr_t *) (gpe_hdr0 + 1); + + /* Pop the iOAM data */ + vlib_buffer_advance (b0, + (word) (sizeof (udp_header_t) + + sizeof (ip4_header_t) + + sizeof (vxlan_gpe_header_t) + + gpe_ioam0->length)); + + return; +} + + + +always_inline void +vxlan_gpe_pop_ioam_v4_one_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vxlan_gpe_main_t * ngm, + vlib_buffer_t * b0, u32 * next0) +{ + CLIB_UNUSED (ip4_header_t * ip0); + CLIB_UNUSED (udp_header_t * udp_hdr0); + CLIB_UNUSED (vxlan_gpe_header_t * gpe_hdr0); + CLIB_UNUSED (vxlan_gpe_ioam_hdr_t * gpe_ioam0); + CLIB_UNUSED (vxlan_gpe_ioam_option_t * opt0); + CLIB_UNUSED (vxlan_gpe_ioam_option_t * limit0); + vxlan_gpe_ioam_main_t *hm = &vxlan_gpe_ioam_main; + + + /* Pop the iOAM header */ + ip0 = vlib_buffer_get_current (b0); + udp_hdr0 = (udp_header_t *) (ip0 + 1); + gpe_hdr0 = (vxlan_gpe_header_t *) (udp_hdr0 + 1); + gpe_ioam0 = (vxlan_gpe_ioam_hdr_t *) (gpe_hdr0 + 1); + opt0 = (vxlan_gpe_ioam_option_t *) (gpe_ioam0 + 1); + limit0 = (vxlan_gpe_ioam_option_t *) ((u8 *) gpe_ioam0 + gpe_ioam0->length); + + /* + * Basic validity checks + */ + if 
(gpe_ioam0->length > clib_net_to_host_u16 (ip0->length)) + { + *next0 = VXLAN_GPE_INPUT_NEXT_DROP; + goto trace00; + } + + /* Scan the set of h-b-h options, process ones that we understand */ + while (opt0 < limit0) + { + u8 type0; + type0 = opt0->type; + switch (type0) + { + case 0: /* Pad1 */ + opt0 = (vxlan_gpe_ioam_option_t *) ((u8 *) opt0) + 1; + continue; + case 1: /* PadN */ + break; + default: + if (hm->pop_options[type0]) + { + if ((*hm->pop_options[type0]) (ip0, opt0) < 0) + { + *next0 = VXLAN_GPE_INPUT_NEXT_DROP; + goto trace00; + } + } + break; + } + opt0 = + (vxlan_gpe_ioam_option_t *) (((u8 *) opt0) + opt0->length + + sizeof (vxlan_gpe_ioam_hdr_t)); + } + + + *next0 = + (gpe_ioam0->protocol < VXLAN_GPE_PROTOCOL_MAX) ? + ngm-> + decap_next_node_list[gpe_ioam0->protocol] : VXLAN_GPE_INPUT_NEXT_DROP; + +trace00: + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + vxlan_gpe_pop_ioam_v4_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + u32 trace_len = gpe_ioam0->length; + t->fmt_trace.next_index = *next0; + /* Capture the h-b-h option verbatim */ + trace_len = + trace_len < + ARRAY_LEN (t->fmt_trace. + option_data) ? trace_len : ARRAY_LEN (t->fmt_trace. + option_data); + t->fmt_trace.trace_len = trace_len; + clib_memcpy (&(t->fmt_trace.option_data), gpe_ioam0, trace_len); + } + + /* Remove the iOAM header inside the VxLAN-GPE header */ + vxlan_gpe_ioam_pop_v4 (vm, node, b0); + return; +} + +always_inline void +vxlan_gpe_pop_ioam_v4_two_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vxlan_gpe_main_t * ngm, + vlib_buffer_t * b0, vlib_buffer_t * b1, + u32 * next0, u32 * next1) +{ + + vxlan_gpe_pop_ioam_v4_one_inline (vm, node, ngm, b0, next0); + vxlan_gpe_pop_ioam_v4_one_inline (vm, node, ngm, b1, next1); +} + + + +static uword +vxlan_gpe_pop_ioam (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame, u8 is_ipv6) +{ + u32 n_left_from, next_index, *from, *to_next; + vxlan_gpe_main_t *ngm = &vxlan_gpe_main; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next >= 2) + { + u32 bi0, bi1; + vlib_buffer_t *b0, *b1; + u32 next0, next1; + + /* Prefetch next iteration. 
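+	     As in the encap path, headers and data for the two packets
+	     beyond the current pair are pulled into cache ahead of use.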
*/ + { + vlib_buffer_t *p2, *p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (p3->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD); + } + + bi0 = from[0]; + bi1 = from[1]; + to_next[0] = bi0; + to_next[1] = bi1; + from += 2; + to_next += 2; + n_left_to_next -= 2; + n_left_from -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + vxlan_gpe_pop_ioam_v4_two_inline (vm, node, ngm, b0, b1, &next0, + &next1); + + + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, to_next, + n_left_to_next, bi0, bi1, next0, + next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + u32 next0; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + vxlan_gpe_pop_ioam_v4_one_inline (vm, node, ngm, b0, &next0); + + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return from_frame->n_vectors; +} + + +static uword +vxlan_gpe_pop_ioam_v4 (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * from_frame) +{ + return vxlan_gpe_pop_ioam (vm, node, from_frame, 0); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (vxlan_gpe_pop_ioam_v4_node) = { + .function = vxlan_gpe_pop_ioam_v4, + .name = "vxlan-gpe-pop-ioam-v4", + .vector_size = sizeof (u32), + .format_trace = format_vxlan_gpe_pop_ioam_v4_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(vxlan_gpe_pop_ioam_v4_error_strings), + .error_strings = vxlan_gpe_pop_ioam_v4_error_strings, + + .n_next_nodes = VXLAN_GPE_INPUT_N_NEXT, + + .next_nodes = { +#define _(s,n) [VXLAN_GPE_INPUT_NEXT_##s] = n, + foreach_vxlan_gpe_input_next +#undef _ + }, +}; +/* *INDENT-ON* */ + + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/lib-vxlan-gpe/ioam_transit.c b/src/plugins/ioam/lib-vxlan-gpe/ioam_transit.c new file mode 100644 index 00000000..60eabc22 --- /dev/null +++ b/src/plugins/ioam/lib-vxlan-gpe/ioam_transit.c @@ -0,0 +1,187 @@ + /* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include <vppinfra/error.h> +#include <vppinfra/hash.h> +#include <vnet/vnet.h> +#include <vnet/ip/ip.h> +#include <vnet/udp/udp.h> +#include <vnet/ethernet/ethernet.h> +#include <vnet/vxlan-gpe/vxlan_gpe.h> +#include <ioam/lib-vxlan-gpe/vxlan_gpe_ioam_packet.h> +#include <ioam/lib-vxlan-gpe/vxlan_gpe_ioam.h> +#include <ioam/lib-vxlan-gpe/vxlan_gpe_ioam_util.h> +#include <vnet/fib/ip4_fib.h> +#include <vnet/fib/fib_entry.h> + +/* Statistics (not really errors) */ +#define foreach_vxlan_gpe_transit_ioam_error \ +_(ENCAPSULATED, "good packets encapsulated") + +static char *vxlan_gpe_transit_ioam_error_strings[] = { +#define _(sym,string) string, + foreach_vxlan_gpe_transit_ioam_error +#undef _ +}; + +typedef enum +{ +#define _(sym,str) VXLAN_GPE_TRANSIT_IOAM_ERROR_##sym, + foreach_vxlan_gpe_transit_ioam_error +#undef _ + VXLAN_GPE_TRANSIT_IOAM_N_ERROR, +} vxlan_gpe_transit_ioam_error_t; + +typedef enum +{ + VXLAN_GPE_TRANSIT_IOAM_NEXT_OUTPUT, + VXLAN_GPE_TRANSIT_IOAM_NEXT_DROP, + VXLAN_GPE_TRANSIT_IOAM_N_NEXT +} vxlan_gpe_transit_ioam_next_t; + + +/* *INDENT-OFF* */ +VNET_FEATURE_INIT (vxlan_gpe_transit_ioam, static) = +{ + .arc_name = "ip4-output", + .node_name = "vxlan-gpe-transit-ioam", + .runs_before = VNET_FEATURES ("interface-output"), +}; +/* *INDENT-ON* */ + +static uword +vxlan_gpe_transit_ioam (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * from_frame) +{ + u32 n_left_from, next_index, *from, *to_next; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + u32 next0 = VXLAN_GPE_TRANSIT_IOAM_NEXT_OUTPUT; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + ip4_header_t *ip0; + u32 iph_offset = 0; + + b0 = vlib_get_buffer (vm, bi0); + iph_offset = vnet_buffer (b0)->ip.save_rewrite_length; + ip0 = (ip4_header_t *) ((u8 *) vlib_buffer_get_current (b0) + + iph_offset); + + /* just forward non ipv4 packets */ + if (PREDICT_FALSE + ((ip0->ip_version_and_header_length & 0xF0) == 0x40)) + { + /* ipv4 packets */ + udp_header_t *udp_hdr0 = (udp_header_t *) (ip0 + 1); + if (PREDICT_FALSE + ((ip0->protocol == IP_PROTOCOL_UDP) && + (clib_net_to_host_u16 (udp_hdr0->dst_port) == + UDP_DST_PORT_VXLAN_GPE))) + { + + /* Check the iOAM header */ + vxlan_gpe_header_t *gpe_hdr0 = + (vxlan_gpe_header_t *) (udp_hdr0 + 1); + + if (PREDICT_FALSE + (gpe_hdr0->protocol == VXLAN_GPE_PROTOCOL_IOAM)) + { + uword *t = NULL; + vxlan_gpe_ioam_main_t *hm = &vxlan_gpe_ioam_main; + fib_prefix_t key4; + memset (&key4, 0, sizeof (key4)); + key4.fp_proto = FIB_PROTOCOL_IP4; + key4.fp_addr.ip4.as_u32 = ip0->dst_address.as_u32; + t = hash_get_mem (hm->dst_by_ip4, &key4); + if (t) + { + + + vlib_buffer_advance (b0, + (word) (sizeof + (ethernet_header_t))); + vxlan_gpe_encap_decap_ioam_v4_one_inline (vm, node, + b0, + &next0, + VXLAN_GPE_TRANSIT_IOAM_NEXT_DROP, + 1 + /* use_adj */ + ); + vlib_buffer_advance (b0, + -(word) (sizeof + (ethernet_header_t))); + } + } + } + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return from_frame->n_vectors; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (vxlan_gpe_transit_ioam_node) = { + 
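+  /* Runs on the ip4-output feature arc (see VNET_FEATURE_INIT above).
+   * Only IPv4/UDP packets to the VxLAN-GPE port whose GPE protocol is
+   * iOAM and whose destination is in dst_by_ip4 are rewritten; all
+   * other traffic falls through to interface-output untouched. */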
.function = vxlan_gpe_transit_ioam, + .name = "vxlan-gpe-transit-ioam", + .vector_size = sizeof (u32), + .format_trace = format_vxlan_gpe_ioam_v4_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(vxlan_gpe_transit_ioam_error_strings), + .error_strings = vxlan_gpe_transit_ioam_error_strings, + + .n_next_nodes = VXLAN_GPE_TRANSIT_IOAM_N_NEXT, + + .next_nodes = { + [VXLAN_GPE_TRANSIT_IOAM_NEXT_OUTPUT] = "interface-output", + [VXLAN_GPE_TRANSIT_IOAM_NEXT_DROP] = "error-drop", + }, + +}; +/* *INDENT-ON* */ + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/lib-vxlan-gpe/ioam_vxlan_gpe.api b/src/plugins/ioam/lib-vxlan-gpe/ioam_vxlan_gpe.api new file mode 100644 index 00000000..a6761f07 --- /dev/null +++ b/src/plugins/ioam/lib-vxlan-gpe/ioam_vxlan_gpe.api @@ -0,0 +1,111 @@ +/* Hey Emacs use -*- mode: C -*- */ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/** \brief iOAM Over VxLAN-GPE - Set iOAM transport for VxLAN-GPE + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param id - profile id + @param trace_ppc - Trace PPC (none/encap/decap) + @param pow_enable - Proof of Work enabled or not flag + @param trace_enable - iOAM Trace enabled or not flag + +*/ +autoreply define vxlan_gpe_ioam_enable { + u32 client_index; + u32 context; + u16 id; + u8 trace_ppc; + u8 pow_enable; + u8 trace_enable; +}; + +/** \brief iOAM for VxLAN-GPE disable + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param id - profile id +*/ +autoreply define vxlan_gpe_ioam_disable +{ + u32 client_index; + u32 context; + u16 id; +}; + +/** \brief Enable iOAM for a VNI (VXLAN-GPE) + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param vni - VXLAN-GPE VNI + @param local - IPv4/6 Address of the local VTEP + @param remote - IPv4/6 Address of the remote VTEP + +*/ +autoreply define vxlan_gpe_ioam_vni_enable { + u32 client_index; + u32 context; + u32 vni; + u8 local[16]; + u8 remote[16]; + u8 is_ipv6; +}; + +/** \brief Disable iOAM for a VNI (VXLAN-GPE) + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param vni - VXLAN-GPE VNI + @param local - IPv4/6 Address of the local VTEP + @param remote - IPv4/6 Address of the remote VTEP + +*/ +autoreply define vxlan_gpe_ioam_vni_disable { + u32 client_index; + u32 context; + u32 vni; + u8 local[16]; + u8 remote[16]; + u8 is_ipv6; +}; + +/** \brief Enable iOAM for a VXLAN-GPE transit + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param dst_addr - IPv4/6 Address of the local VTEP + @param outer_fib_index- FIB index + +*/ +autoreply 
define vxlan_gpe_ioam_transit_enable { + u32 client_index; + u32 context; + u32 outer_fib_index; + u8 dst_addr[16]; + u8 is_ipv6; +}; + +/** \brief Disable iOAM for VXLAN-GPE transit + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param dst_addr - IPv4/6 Address of the local VTEP + @param outer_fib_index- FIB index + +*/ +autoreply define vxlan_gpe_ioam_transit_disable { + u32 client_index; + u32 context; + u32 outer_fib_index; + u8 dst_addr[16]; + u8 is_ipv6; +}; + diff --git a/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_all_api_h.h b/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_all_api_h.h new file mode 100644 index 00000000..06fa0d2c --- /dev/null +++ b/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_all_api_h.h @@ -0,0 +1,16 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* Include the generated file, see BUILT_SOURCES in Makefile.am */ +#include <ioam/lib-vxlan-gpe/ioam_vxlan_gpe.api.h> diff --git a/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_api.c b/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_api.c new file mode 100644 index 00000000..634133a4 --- /dev/null +++ b/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_api.c @@ -0,0 +1,377 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + *------------------------------------------------------------------ + * vxlan_gpe_api.c - iOAM VxLAN-GPE related APIs to create + * and maintain profiles + *------------------------------------------------------------------ + */ + +#include <vnet/vnet.h> +#include <vnet/plugin/plugin.h> +#include <ioam/lib-vxlan-gpe/vxlan_gpe_ioam.h> + +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vlibsocket/api.h> + +/* define message IDs */ +#include <ioam/lib-vxlan-gpe/vxlan_gpe_msg_enum.h> + +/* define message structures */ +#define vl_typedefs +#include <ioam/lib-vxlan-gpe/vxlan_gpe_all_api_h.h> +#undef vl_typedefs + +/* define generated endian-swappers */ +#define vl_endianfun +#include <ioam/lib-vxlan-gpe/vxlan_gpe_all_api_h.h> +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) 
vlib_cli_output (handle, __VA_ARGS__) +#define vl_printfun +#include <ioam/lib-vxlan-gpe/vxlan_gpe_all_api_h.h> +#undef vl_printfun + +/* Get the API version number */ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <ioam/lib-vxlan-gpe/vxlan_gpe_all_api_h.h> +#undef vl_api_version + +/* + * A handy macro to set up a message reply. + * Assumes that the following variables are available: + * mp - pointer to request message + * rmp - pointer to reply message type + * rv - return value + */ + +#define VXLAN_GPE_REPLY_MACRO(t) \ +do { \ + unix_shared_memory_queue_t * q = \ + vl_api_client_index_to_input_queue (mp->client_index); \ + if (!q) \ + return; \ + \ + rmp = vl_msg_api_alloc (sizeof (*rmp)); \ + rmp->_vl_msg_id = ntohs((t)+sm->msg_id_base); \ + rmp->context = mp->context; \ + rmp->retval = ntohl(rv); \ + \ + vl_msg_api_send_shmem (q, (u8 *)&rmp); \ +} while(0); + +/* *INDENT-OFF* */ +#define VXLAN_GPE_REPLY_MACRO2(t, body) \ +do { \ + unix_shared_memory_queue_t * q; \ + rv = vl_msg_api_pd_handler (mp, rv); \ + q = vl_api_client_index_to_input_queue (mp->client_index); \ + if (!q) \ + return; \ + \ + rmp = vl_msg_api_alloc (sizeof (*rmp)); \ + rmp->_vl_msg_id = ntohs((t)); \ + rmp->context = mp->context; \ + rmp->retval = ntohl(rv); \ + do {body;} while (0); \ + vl_msg_api_send_shmem (q, (u8 *)&rmp); \ +} while(0); +/* *INDENT-ON* */ + +/* List of message types that this plugin understands */ + +#define foreach_vxlan_gpe_plugin_api_msg \ +_(VXLAN_GPE_IOAM_ENABLE, vxlan_gpe_ioam_enable) \ +_(VXLAN_GPE_IOAM_DISABLE, vxlan_gpe_ioam_disable) \ +_(VXLAN_GPE_IOAM_VNI_ENABLE, vxlan_gpe_ioam_vni_enable) \ +_(VXLAN_GPE_IOAM_VNI_DISABLE, vxlan_gpe_ioam_vni_disable) \ +_(VXLAN_GPE_IOAM_TRANSIT_ENABLE, vxlan_gpe_ioam_transit_enable) \ +_(VXLAN_GPE_IOAM_TRANSIT_DISABLE, vxlan_gpe_ioam_transit_disable) \ + + +static void vl_api_vxlan_gpe_ioam_enable_t_handler + (vl_api_vxlan_gpe_ioam_enable_t * mp) +{ + int rv = 0; + vl_api_vxlan_gpe_ioam_enable_reply_t *rmp; + clib_error_t *error; + vxlan_gpe_ioam_main_t *sm = &vxlan_gpe_ioam_main; + + /* Ignoring the profile id as currently a single profile + * is supported */ + error = + vxlan_gpe_ioam_enable (mp->trace_enable, mp->pow_enable, mp->trace_ppc); + if (error) + { + clib_error_report (error); + rv = clib_error_get_code (error); + } + + VXLAN_GPE_REPLY_MACRO (VL_API_VXLAN_GPE_IOAM_ENABLE_REPLY); +} + +static void vl_api_vxlan_gpe_ioam_disable_t_handler + (vl_api_vxlan_gpe_ioam_disable_t * mp) +{ + int rv = 0; + vl_api_vxlan_gpe_ioam_disable_reply_t *rmp; + clib_error_t *error; + vxlan_gpe_ioam_main_t *sm = &vxlan_gpe_ioam_main; + + /* Ignoring the profile id as currently a single profile + * is supported */ + error = vxlan_gpe_ioam_disable (0, 0, 0); + if (error) + { + clib_error_report (error); + rv = clib_error_get_code (error); + } + + VXLAN_GPE_REPLY_MACRO (VL_API_VXLAN_GPE_IOAM_DISABLE_REPLY); +} + +static void vl_api_vxlan_gpe_ioam_vni_enable_t_handler + (vl_api_vxlan_gpe_ioam_vni_enable_t * mp) +{ + int rv = 0; + vl_api_vxlan_gpe_ioam_vni_enable_reply_t *rmp; + clib_error_t *error; + vxlan_gpe_ioam_main_t *sm = &vxlan_gpe_ioam_main; + vxlan4_gpe_tunnel_key_t key4; + uword *p = NULL; + vxlan_gpe_main_t *gm = &vxlan_gpe_main; + vxlan_gpe_tunnel_t *t = 0; + vxlan_gpe_ioam_main_t *hm = &vxlan_gpe_ioam_main; + u32 vni; + + + if (!mp->is_ipv6) + { + clib_memcpy (&key4.local, &mp->local, sizeof (key4.local)); + clib_memcpy (&key4.remote, &mp->remote, sizeof (key4.remote)); + vni = clib_net_to_host_u32 (mp->vni); + key4.vni = 
clib_host_to_net_u32 (vni << 8); + key4.pad = 0; + + p = hash_get_mem (gm->vxlan4_gpe_tunnel_by_key, &key4); + } + else + { + return; + } + + if (!p) + return; + + t = pool_elt_at_index (gm->tunnels, p[0]); + + error = vxlan_gpe_ioam_set (t, hm->has_trace_option, + hm->has_pot_option, + hm->has_ppc_option, mp->is_ipv6); + + + if (error) + { + clib_error_report (error); + rv = clib_error_get_code (error); + } + + VXLAN_GPE_REPLY_MACRO (VL_API_VXLAN_GPE_IOAM_VNI_ENABLE_REPLY); +} + + +static void vl_api_vxlan_gpe_ioam_vni_disable_t_handler + (vl_api_vxlan_gpe_ioam_vni_disable_t * mp) +{ + int rv = 0; + vl_api_vxlan_gpe_ioam_vni_enable_reply_t *rmp; + clib_error_t *error; + vxlan_gpe_ioam_main_t *sm = &vxlan_gpe_ioam_main; + vxlan4_gpe_tunnel_key_t key4; + uword *p = NULL; + vxlan_gpe_main_t *gm = &vxlan_gpe_main; + vxlan_gpe_tunnel_t *t = 0; + u32 vni; + + + if (!mp->is_ipv6) + { + clib_memcpy (&key4.local, &mp->local, sizeof (key4.local)); + clib_memcpy (&key4.remote, &mp->remote, sizeof (key4.remote)); + vni = clib_net_to_host_u32 (mp->vni); + key4.vni = clib_host_to_net_u32 (vni << 8); + key4.pad = 0; + + p = hash_get_mem (gm->vxlan4_gpe_tunnel_by_key, &key4); + } + else + { + return; + } + + if (!p) + return; + + t = pool_elt_at_index (gm->tunnels, p[0]); + + error = vxlan_gpe_ioam_clear (t, 0, 0, 0, 0); + + + if (error) + { + clib_error_report (error); + rv = clib_error_get_code (error); + } + + + VXLAN_GPE_REPLY_MACRO (VL_API_VXLAN_GPE_IOAM_VNI_DISABLE_REPLY); +} + +static void vl_api_vxlan_gpe_ioam_transit_enable_t_handler + (vl_api_vxlan_gpe_ioam_transit_enable_t * mp) +{ + int rv = 0; + vl_api_vxlan_gpe_ioam_transit_enable_reply_t *rmp; + vxlan_gpe_ioam_main_t *sm = &vxlan_gpe_ioam_main; + ip46_address_t dst_addr; + + memset (&dst_addr.ip4, 0, sizeof (dst_addr.ip4)); + if (!mp->is_ipv6) + { + clib_memcpy (&dst_addr.ip4, &mp->dst_addr, sizeof (dst_addr.ip4)); + } + rv = vxlan_gpe_enable_disable_ioam_for_dest (sm->vlib_main, + dst_addr, + ntohl (mp->outer_fib_index), + mp->is_ipv6 ? 0 : 1, + 1 /* is_add */ ); + + VXLAN_GPE_REPLY_MACRO (VL_API_VXLAN_GPE_IOAM_TRANSIT_ENABLE_REPLY); +} + +static void vl_api_vxlan_gpe_ioam_transit_disable_t_handler + (vl_api_vxlan_gpe_ioam_transit_disable_t * mp) +{ + int rv = 0; + vl_api_vxlan_gpe_ioam_transit_disable_reply_t *rmp; + vxlan_gpe_ioam_main_t *sm = &vxlan_gpe_ioam_main; + ip46_address_t dst_addr; + + memset (&dst_addr.ip4, 0, sizeof (dst_addr.ip4)); + if (!mp->is_ipv6) + { + clib_memcpy (&dst_addr.ip4, &mp->dst_addr, sizeof (dst_addr.ip4)); + } + + rv = vxlan_gpe_ioam_disable_for_dest (sm->vlib_main, + dst_addr, + ntohl (mp->outer_fib_index), + mp->is_ipv6 ? 
0 : 1); + VXLAN_GPE_REPLY_MACRO (VL_API_VXLAN_GPE_IOAM_TRANSIT_DISABLE_REPLY); +} + +/* Set up the API message handling tables */ +static clib_error_t * +vxlan_gpe_plugin_api_hookup (vlib_main_t * vm) +{ + vxlan_gpe_ioam_main_t *sm = &vxlan_gpe_ioam_main; +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + sm->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_vxlan_gpe_plugin_api_msg; +#undef _ + + return 0; +} + +#define vl_msg_name_crc_list +#include <ioam/lib-vxlan-gpe/vxlan_gpe_all_api_h.h> +#undef vl_msg_name_crc_list + +static void +setup_message_id_table (vxlan_gpe_ioam_main_t * sm, api_main_t * am) +{ +#define _(id,n,crc) \ + vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id + sm->msg_id_base); + foreach_vl_msg_name_crc_ioam_vxlan_gpe; +#undef _ +} + +static clib_error_t * +vxlan_gpe_init (vlib_main_t * vm) +{ + vxlan_gpe_ioam_main_t *sm = &vxlan_gpe_ioam_main; + clib_error_t *error = 0; + u8 *name; + u32 encap_node_index = vxlan_gpe_encap_ioam_v4_node.index; + u32 decap_node_index = vxlan_gpe_decap_ioam_v4_node.index; + vlib_node_t *vxlan_gpe_encap_node = NULL; + vlib_node_t *vxlan_gpe_decap_node = NULL; + uword next_node = 0; + + sm->vlib_main = vm; + sm->vnet_main = vnet_get_main (); + sm->unix_time_0 = (u32) time (0); /* Store starting time */ + sm->vlib_time_0 = vlib_time_now (vm); + + name = format (0, "ioam_vxlan_gpe_%08x%c", api_version, 0); + + /* Ask for a correctly-sized block of API message decode slots */ + sm->msg_id_base = vl_msg_api_get_msg_ids + ((char *) name, VL_MSG_FIRST_AVAILABLE); + + error = vxlan_gpe_plugin_api_hookup (vm); + + /* Add our API messages to the global name_crc hash table */ + setup_message_id_table (sm, &api_main); + + /* Hook the ioam-encap node to vxlan-gpe-encap */ + vxlan_gpe_encap_node = vlib_get_node_by_name (vm, (u8 *) "vxlan-gpe-encap"); + sm->encap_v4_next_node = + vlib_node_add_next (vm, vxlan_gpe_encap_node->index, encap_node_index); + + vxlan_gpe_decap_node = + vlib_get_node_by_name (vm, (u8 *) "vxlan4-gpe-input"); + next_node = + vlib_node_add_next (vm, vxlan_gpe_decap_node->index, decap_node_index); + vxlan_gpe_register_decap_protocol (VXLAN_GPE_PROTOCOL_IOAM, next_node); + + vec_new (vxlan_gpe_ioam_sw_interface_t, pool_elts (sm->sw_interfaces)); + sm->dst_by_ip4 = hash_create_mem (0, sizeof (fib_prefix_t), sizeof (uword)); + + sm->dst_by_ip6 = hash_create_mem (0, sizeof (fib_prefix_t), sizeof (uword)); + + vxlan_gpe_ioam_interface_init (); + vec_free (name); + + return error; +} + +VLIB_INIT_FUNCTION (vxlan_gpe_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_ioam.c b/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_ioam.c new file mode 100644 index 00000000..8558c505 --- /dev/null +++ b/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_ioam.c @@ -0,0 +1,770 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <vnet/vxlan-gpe/vxlan_gpe.h> +#include <vnet/vxlan-gpe/vxlan_gpe_packet.h> +#include <vnet/ip/format.h> +#include <ioam/lib-vxlan-gpe/vxlan_gpe_ioam.h> +#include <vnet/dpo/load_balance.h> +#include <vnet/fib/ip4_fib.h> +#include <vnet/fib/fib_entry.h> + +vxlan_gpe_ioam_main_t vxlan_gpe_ioam_main; + +int +vxlan_gpe_ioam_set_rewrite (vxlan_gpe_tunnel_t * t, int has_trace_option, + int has_pot_option, int has_ppc_option, + u8 ipv6_set) +{ + vxlan_gpe_ioam_main_t *hm = &vxlan_gpe_ioam_main; + u32 size; + vxlan_gpe_ioam_hdr_t *vxlan_gpe_ioam_hdr; + u8 *current; + u8 trace_data_size = 0; + u8 pot_data_size = 0; + + if (has_trace_option == 0 && has_pot_option == 0) + return -1; + + /* Work out how much space we need */ + size = sizeof (vxlan_gpe_ioam_hdr_t); + + if (has_trace_option + && hm->add_options[VXLAN_GPE_OPTION_TYPE_IOAM_TRACE] != 0) + { + size += sizeof (vxlan_gpe_ioam_option_t); + size += hm->options_size[VXLAN_GPE_OPTION_TYPE_IOAM_TRACE]; + } + if (has_pot_option + && hm->add_options[VXLAN_GPE_OPTION_TYPE_IOAM_PROOF_OF_TRANSIT] != 0) + { + size += sizeof (vxlan_gpe_ioam_option_t); + size += hm->options_size[VXLAN_GPE_OPTION_TYPE_IOAM_PROOF_OF_TRANSIT]; + } + + t->rewrite_size = size; + + if (!ipv6_set) + { + vxlan4_gpe_rewrite (t, size, VXLAN_GPE_PROTOCOL_IOAM, + hm->encap_v4_next_node); + vxlan_gpe_ioam_hdr = + (vxlan_gpe_ioam_hdr_t *) (t->rewrite + + sizeof (ip4_vxlan_gpe_header_t)); + } + else + { + vxlan6_gpe_rewrite (t, size, VXLAN_GPE_PROTOCOL_IOAM, + VXLAN_GPE_ENCAP_NEXT_IP6_LOOKUP); + vxlan_gpe_ioam_hdr = + (vxlan_gpe_ioam_hdr_t *) (t->rewrite + + sizeof (ip6_vxlan_gpe_header_t)); + } + + + vxlan_gpe_ioam_hdr->type = VXLAN_GPE_PROTOCOL_IOAM; + /* Length of the header in octets */ + vxlan_gpe_ioam_hdr->length = size; + vxlan_gpe_ioam_hdr->protocol = t->protocol; + current = (u8 *) vxlan_gpe_ioam_hdr + sizeof (vxlan_gpe_ioam_hdr_t); + + if (has_trace_option + && hm->add_options[VXLAN_GPE_OPTION_TYPE_IOAM_TRACE] != 0) + { + if (0 != hm->add_options[VXLAN_GPE_OPTION_TYPE_IOAM_TRACE] (current, + &trace_data_size)) + return -1; + current += trace_data_size; + } + if (has_pot_option + && hm->add_options[VXLAN_GPE_OPTION_TYPE_IOAM_PROOF_OF_TRANSIT] != 0) + { + pot_data_size = + hm->options_size[VXLAN_GPE_OPTION_TYPE_IOAM_PROOF_OF_TRANSIT]; + if (0 == + hm->add_options[VXLAN_GPE_OPTION_TYPE_IOAM_PROOF_OF_TRANSIT] + (current, &pot_data_size)) + current += pot_data_size; + } + + return 0; +} + +int +vxlan_gpe_ioam_clear_rewrite (vxlan_gpe_tunnel_t * t, int has_trace_option, + int has_pot_option, int has_ppc_option, + u8 ipv6_set) +{ + + t->rewrite_size = 0; + + if (!ipv6_set) + { + vxlan4_gpe_rewrite (t, 0, 0, VXLAN_GPE_ENCAP_NEXT_IP4_LOOKUP); + } + else + { + vxlan6_gpe_rewrite (t, 0, 0, VXLAN_GPE_ENCAP_NEXT_IP6_LOOKUP); + } + + + return 0; +} + +clib_error_t * +vxlan_gpe_ioam_clear (vxlan_gpe_tunnel_t * t, + int has_trace_option, int has_pot_option, + int has_ppc_option, u8 ipv6_set) +{ + int rv; + rv = vxlan_gpe_ioam_clear_rewrite (t, 0, 0, 0, 0); + + if (rv == 0) + { + return (0); + } + else + { + return clib_error_return_code (0, rv, 0, + "vxlan_gpe_ioam_clear_rewrite returned %d", + rv); + } + +} + + +clib_error_t * +vxlan_gpe_ioam_set (vxlan_gpe_tunnel_t * t, + int has_trace_option, int has_pot_option, + int has_ppc_option, u8 ipv6_set) +{ + int rv; + rv = vxlan_gpe_ioam_set_rewrite (t, has_trace_option, + has_pot_option, has_ppc_option, ipv6_set); + + if (rv 
== 0) + { + return (0); + } + else + { + return clib_error_return_code (0, rv, 0, + "vxlan_gpe_ioam_set_rewrite returned %d", + rv); + } + +} + +static void +vxlan_gpe_set_clear_output_feature_on_intf (vlib_main_t * vm, + u32 sw_if_index0, u8 is_add) +{ + + + + vnet_feature_enable_disable ("ip4-output", "vxlan-gpe-transit-ioam", + sw_if_index0, is_add, + 0 /* void *feature_config */ , + 0 /* u32 n_feature_config_bytes */ ); + return; +} + +void +vxlan_gpe_clear_output_feature_on_all_intfs (vlib_main_t * vm) +{ + vnet_sw_interface_t *si = 0; + vnet_main_t *vnm = vnet_get_main (); + vnet_interface_main_t *im = &vnm->interface_main; + + pool_foreach (si, im->sw_interfaces, ( + { + vxlan_gpe_set_clear_output_feature_on_intf + (vm, si->sw_if_index, 0); + })); + return; +} + + +extern fib_forward_chain_type_t +fib_entry_get_default_chain_type (const fib_entry_t * fib_entry); + +int +vxlan_gpe_enable_disable_ioam_for_dest (vlib_main_t * vm, + ip46_address_t dst_addr, + u32 outer_fib_index, + u8 is_ipv4, u8 is_add) +{ + vxlan_gpe_ioam_main_t *hm = &vxlan_gpe_ioam_main; + u32 fib_index0 = 0; + u32 sw_if_index0 = ~0; + + fib_node_index_t fei = ~0; + fib_entry_t *fib_entry; + u32 adj_index0; + ip_adjacency_t *adj0; + fib_prefix_t fib_prefix; + //fib_forward_chain_type_t fct; + load_balance_t *lb_m, *lb_b; + const dpo_id_t *dpo0, *dpo1; + u32 i, j; + //vnet_hw_interface_t *hw; + + if (is_ipv4) + { + memset (&fib_prefix, 0, sizeof (fib_prefix_t)); + fib_prefix.fp_len = 32; + fib_prefix.fp_proto = FIB_PROTOCOL_IP4; + fib_prefix.fp_addr = dst_addr; + } + else + { + return 0; + } + + fei = fib_table_lookup (fib_index0, &fib_prefix); + fib_entry = fib_entry_get (fei); + + //fct = fib_entry_get_default_chain_type (fib_entry); + + if (!dpo_id_is_valid (&fib_entry->fe_lb /*[fct] */ )) + { + return (-1); + } + + lb_m = load_balance_get (fib_entry->fe_lb /*[fct] */ .dpoi_index); + + for (i = 0; i < lb_m->lb_n_buckets; i++) + { + dpo0 = load_balance_get_bucket_i (lb_m, i); + + if (dpo0->dpoi_type == DPO_LOAD_BALANCE) + { + lb_b = load_balance_get (dpo0->dpoi_index); + + for (j = 0; j < lb_b->lb_n_buckets; j++) + { + dpo1 = load_balance_get_bucket_i (lb_b, j); + + if (dpo1->dpoi_type == DPO_ADJACENCY) + { + adj_index0 = dpo1->dpoi_index; + + if (ADJ_INDEX_INVALID == adj_index0) + { + continue; + } + + adj0 = adj_get (adj_index0); + sw_if_index0 = adj0->rewrite_header.sw_if_index; + + if (~0 == sw_if_index0) + { + continue; + } + + + if (is_add) + { + vnet_feature_enable_disable ("ip4-output", + "vxlan-gpe-transit-ioam", + sw_if_index0, is_add, 0 + /* void *feature_config */ + , 0 /* u32 n_feature_config_bytes */ + ); + + vec_validate_init_empty (hm->bool_ref_by_sw_if_index, + sw_if_index0, ~0); + hm->bool_ref_by_sw_if_index[sw_if_index0] = 1; + } + else + { + hm->bool_ref_by_sw_if_index[sw_if_index0] = ~0; + } + } + } + } + } + + if (is_ipv4) + { + + uword *t = NULL; + vxlan_gpe_ioam_dest_tunnels_t *t1; + fib_prefix_t key4, *key4_copy; + hash_pair_t *hp; + memset (&key4, 0, sizeof (key4)); + key4.fp_proto = FIB_PROTOCOL_IP4; + key4.fp_addr.ip4.as_u32 = fib_prefix.fp_addr.ip4.as_u32; + t = hash_get_mem (hm->dst_by_ip4, &key4); + if (is_add) + { + if (t) + { + return 0; + } + pool_get_aligned (hm->dst_tunnels, t1, CLIB_CACHE_LINE_BYTES); + memset (t1, 0, sizeof (*t1)); + t1->fp_proto = FIB_PROTOCOL_IP4; + t1->dst_addr.ip4.as_u32 = fib_prefix.fp_addr.ip4.as_u32; + key4_copy = clib_mem_alloc (sizeof (*key4_copy)); + clib_memcpy (key4_copy, &key4, sizeof (*key4_copy)); + hash_set_mem (hm->dst_by_ip4, key4_copy, t1 - 
hm->dst_tunnels); + /* + * Attach to the FIB entry for the VxLAN-GPE destination + * and become its child. The dest route will invoke a callback + * when the fib entry changes, it can be used to + * re-program the output feature on the egress interface. + */ + + const fib_prefix_t tun_dst_pfx = { + .fp_len = 32, + .fp_proto = FIB_PROTOCOL_IP4, + .fp_addr = {.ip4 = t1->dst_addr.ip4,} + }; + + t1->fib_entry_index = + fib_table_entry_special_add (outer_fib_index, + &tun_dst_pfx, + FIB_SOURCE_RR, FIB_ENTRY_FLAG_NONE); + t1->sibling_index = + fib_entry_child_add (t1->fib_entry_index, + hm->fib_entry_type, t1 - hm->dst_tunnels); + t1->outer_fib_index = outer_fib_index; + + } + else + { + if (!t) + { + return 0; + } + t1 = pool_elt_at_index (hm->dst_tunnels, t[0]); + hp = hash_get_pair (hm->dst_by_ip4, &key4); + key4_copy = (void *) (hp->key); + hash_unset_mem (hm->dst_by_ip4, &key4); + clib_mem_free (key4_copy); + pool_put (hm->dst_tunnels, t1); + } + } + else + { + // TBD for IPv6 + } + + return 0; +} + +void +vxlan_gpe_refresh_output_feature_on_all_dest (void) +{ + vxlan_gpe_ioam_main_t *hm = &vxlan_gpe_ioam_main; + vxlan_gpe_ioam_dest_tunnels_t *t; + u32 i; + if (pool_elts (hm->dst_tunnels) == 0) + return; + vxlan_gpe_clear_output_feature_on_all_intfs (hm->vlib_main); + i = vec_len (hm->bool_ref_by_sw_if_index); + vec_free (hm->bool_ref_by_sw_if_index); + vec_validate_init_empty (hm->bool_ref_by_sw_if_index, i, ~0); + pool_foreach (t, hm->dst_tunnels, ( + { + vxlan_gpe_enable_disable_ioam_for_dest + (hm->vlib_main, + t->dst_addr, + t->outer_fib_index, + (t->fp_proto == FIB_PROTOCOL_IP4), 1 + /* is_add */ + ); + } + )); + return; +} + +void +vxlan_gpe_clear_output_feature_on_select_intfs (void) +{ + vxlan_gpe_ioam_main_t *hm = &vxlan_gpe_ioam_main; + u32 sw_if_index0 = 0; + for (sw_if_index0 = 0; + sw_if_index0 < vec_len (hm->bool_ref_by_sw_if_index); sw_if_index0++) + { + if (hm->bool_ref_by_sw_if_index[sw_if_index0] == 0xFF) + { + vxlan_gpe_set_clear_output_feature_on_intf + (hm->vlib_main, sw_if_index0, 0); + } + } + + return; +} + +static clib_error_t * +vxlan_gpe_set_ioam_rewrite_command_fn (vlib_main_t * + vm, + unformat_input_t + * input, vlib_cli_command_t * cmd) +{ + vxlan_gpe_ioam_main_t *hm = &vxlan_gpe_ioam_main; + ip46_address_t local, remote; + u8 local_set = 0; + u8 remote_set = 0; + u8 ipv4_set = 0; + u8 ipv6_set = 0; + u32 vni; + u8 vni_set = 0; + u8 disable = 0; + clib_error_t *rv = 0; + vxlan4_gpe_tunnel_key_t key4; + vxlan6_gpe_tunnel_key_t key6; + uword *p; + vxlan_gpe_main_t *gm = &vxlan_gpe_main; + vxlan_gpe_tunnel_t *t = 0; + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "local %U", unformat_ip4_address, &local.ip4)) + { + local_set = 1; + ipv4_set = 1; + } + else + if (unformat (input, "remote %U", unformat_ip4_address, &remote.ip4)) + { + remote_set = 1; + ipv4_set = 1; + } + else if (unformat (input, "local %U", unformat_ip6_address, &local.ip6)) + { + local_set = 1; + ipv6_set = 1; + } + else + if (unformat (input, "remote %U", unformat_ip6_address, &remote.ip6)) + { + remote_set = 1; + ipv6_set = 1; + } + else if (unformat (input, "vni %d", &vni)) + vni_set = 1; + else if (unformat (input, "disable")) + disable = 1; + else + break; + } + + if (local_set == 0) + return clib_error_return (0, "tunnel local address not specified"); + if (remote_set == 0) + return clib_error_return (0, "tunnel remote address not specified"); + if (ipv4_set && ipv6_set) + return clib_error_return (0, "both IPv4 and IPv6 addresses specified"); + 
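+  /* A tunnel is keyed by (local, remote, vni), so all three must name
+   * an existing tunnel, e.g.
+   *   set vxlan-gpe-ioam local 10.0.0.1 remote 10.0.0.2 vni 13
+   * where the addresses and vni are illustrative. */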
if ((ipv4_set + && memcmp (&local.ip4, &remote.ip4, + sizeof (local.ip4)) == 0) || (ipv6_set + && + memcmp + (&local.ip6, + &remote.ip6, + sizeof (local.ip6)) == 0)) + return clib_error_return (0, "src and dst addresses are identical"); + if (vni_set == 0) + return clib_error_return (0, "vni not specified"); + if (!ipv6_set) + { + key4.local = local.ip4.as_u32; + key4.remote = remote.ip4.as_u32; + key4.vni = clib_host_to_net_u32 (vni << 8); + key4.pad = 0; + p = hash_get_mem (gm->vxlan4_gpe_tunnel_by_key, &key4); + } + else + { + key6.local.as_u64[0] = local.ip6.as_u64[0]; + key6.local.as_u64[1] = local.ip6.as_u64[1]; + key6.remote.as_u64[0] = remote.ip6.as_u64[0]; + key6.remote.as_u64[1] = remote.ip6.as_u64[1]; + key6.vni = clib_host_to_net_u32 (vni << 8); + p = hash_get_mem (gm->vxlan6_gpe_tunnel_by_key, &key6); + } + + if (!p) + return clib_error_return (0, "VxLAN Tunnel not found"); + t = pool_elt_at_index (gm->tunnels, p[0]); + if (!disable) + { + rv = + vxlan_gpe_ioam_set (t, hm->has_trace_option, + hm->has_pot_option, hm->has_ppc_option, ipv6_set); + } + else + { + rv = vxlan_gpe_ioam_clear (t, 0, 0, 0, 0); + } + return rv; +} + + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (vxlan_gpe_set_ioam_rewrite_cmd, static) = { + .path = "set vxlan-gpe-ioam", + .short_help = "set vxlan-gpe-ioam vxlan <src-ip> <dst_ip> <vnid> [disable]", + .function = vxlan_gpe_set_ioam_rewrite_command_fn, +}; +/* *INDENT-ON* */ + + + +clib_error_t * +vxlan_gpe_ioam_enable (int has_trace_option, + int has_pot_option, int has_ppc_option) +{ + vxlan_gpe_ioam_main_t *hm = &vxlan_gpe_ioam_main; + hm->has_trace_option = has_trace_option; + hm->has_pot_option = has_pot_option; + hm->has_ppc_option = has_ppc_option; + if (hm->has_trace_option) + { + vxlan_gpe_trace_profile_setup (); + } + + return 0; +} + +clib_error_t * +vxlan_gpe_ioam_disable (int + has_trace_option, + int has_pot_option, int has_ppc_option) +{ + vxlan_gpe_ioam_main_t *hm = &vxlan_gpe_ioam_main; + hm->has_trace_option = has_trace_option; + hm->has_pot_option = has_pot_option; + hm->has_ppc_option = has_ppc_option; + if (!hm->has_trace_option) + { + vxlan_gpe_trace_profile_cleanup (); + } + + return 0; +} + +void +vxlan_gpe_set_next_override (uword next) +{ + vxlan_gpe_ioam_main_t *hm = &vxlan_gpe_ioam_main; + hm->decap_v4_next_override = next; + return; +} + +static clib_error_t * +vxlan_gpe_set_ioam_flags_command_fn (vlib_main_t * vm, + unformat_input_t + * input, vlib_cli_command_t * cmd) +{ + int has_trace_option = 0; + int has_pot_option = 0; + int has_ppc_option = 0; + clib_error_t *rv = 0; + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "trace")) + has_trace_option = 1; + else if (unformat (input, "pot")) + has_pot_option = 1; + else if (unformat (input, "ppc encap")) + has_ppc_option = PPC_ENCAP; + else if (unformat (input, "ppc decap")) + has_ppc_option = PPC_DECAP; + else if (unformat (input, "ppc none")) + has_ppc_option = PPC_NONE; + else + break; + } + + + rv = + vxlan_gpe_ioam_enable (has_trace_option, has_pot_option, has_ppc_option); + return rv; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (vxlan_gpe_set_ioam_flags_cmd, static) = +{ +.path = "set vxlan-gpe-ioam rewrite", +.short_help = "set vxlan-gpe-ioam [trace] [pot] [ppc <encap|decap>]", +.function = vxlan_gpe_set_ioam_flags_command_fn,}; +/* *INDENT-ON* */ + + +int vxlan_gpe_ioam_disable_for_dest + (vlib_main_t * vm, ip46_address_t dst_addr, u32 outer_fib_index, + u8 ipv4_set) +{ + vxlan_gpe_ioam_dest_tunnels_t *t; + vxlan_gpe_ioam_main_t *hm = 
&vxlan_gpe_ioam_main; + + vxlan_gpe_enable_disable_ioam_for_dest (hm->vlib_main, + dst_addr, outer_fib_index, ipv4_set, + 0); + if (pool_elts (hm->dst_tunnels) == 0) + { + vxlan_gpe_clear_output_feature_on_select_intfs (); + return 0; + } + + pool_foreach (t, hm->dst_tunnels, ( + { + vxlan_gpe_enable_disable_ioam_for_dest + (hm->vlib_main, + t->dst_addr, + t->outer_fib_index, + (t->fp_proto == + FIB_PROTOCOL_IP4), 1 /* is_add */ ); + } + )); + vxlan_gpe_clear_output_feature_on_select_intfs (); + return (0); + +} + +static clib_error_t *vxlan_gpe_set_ioam_transit_rewrite_command_fn + (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vxlan_gpe_ioam_main_t *hm = &vxlan_gpe_ioam_main; + ip46_address_t dst_addr; + u8 dst_addr_set = 0; + u8 ipv4_set = 0; + u8 ipv6_set = 0; + u8 disable = 0; + clib_error_t *rv = 0; + u32 outer_fib_index = 0; + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "dst-ip %U", unformat_ip4_address, &dst_addr.ip4)) + { + dst_addr_set = 1; + ipv4_set = 1; + } + else + if (unformat + (input, "dst-ip %U", unformat_ip6_address, &dst_addr.ip6)) + { + dst_addr_set = 1; + ipv6_set = 1; + } + else if (unformat (input, "outer-fib-index %d", &outer_fib_index)) + { + } + + else if (unformat (input, "disable")) + disable = 1; + else + break; + } + + if (dst_addr_set == 0) + return clib_error_return (0, "tunnel destination address not specified"); + if (ipv4_set && ipv6_set) + return clib_error_return (0, "both IPv4 and IPv6 addresses specified"); + if (!disable) + { + vxlan_gpe_enable_disable_ioam_for_dest (hm->vlib_main, + dst_addr, outer_fib_index, + ipv4_set, 1); + } + else + { + vxlan_gpe_ioam_disable_for_dest + (vm, dst_addr, outer_fib_index, ipv4_set); + } + return rv; +} + + /* *INDENT-OFF* */ +VLIB_CLI_COMMAND (vxlan_gpe_set_ioam_transit_rewrite_cmd, static) = { + .path = "set vxlan-gpe-ioam-transit", + .short_help = "set vxlan-gpe-ioam-transit dst-ip <dst_ip> [outer-fib-index <outer_fib_index>] [disable]", + .function = vxlan_gpe_set_ioam_transit_rewrite_command_fn, +}; +/* *INDENT-ON* */ + +clib_error_t *clear_vxlan_gpe_ioam_rewrite_command_fn + (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) +{ + return (vxlan_gpe_ioam_disable (0, 0, 0)); +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (vxlan_gpe_clear_ioam_flags_cmd, static) = +{ +.path = "clear vxlan-gpe-ioam rewrite", +.short_help = "clear vxlan-gpe-ioam rewrite", +.function = clear_vxlan_gpe_ioam_rewrite_command_fn, +}; +/* *INDENT-ON* */ + + +/** + * Function definition to backwalk a FIB node + */ +static fib_node_back_walk_rc_t +vxlan_gpe_ioam_back_walk (fib_node_t * node, fib_node_back_walk_ctx_t * ctx) +{ + vxlan_gpe_refresh_output_feature_on_all_dest (); + return (FIB_NODE_BACK_WALK_CONTINUE); +} + +/** + * Function definition to get a FIB node from its index + */ +static fib_node_t * +vxlan_gpe_ioam_fib_node_get (fib_node_index_t index) +{ + vxlan_gpe_ioam_main_t *hm = &vxlan_gpe_ioam_main; + return (&hm->node); +} + +/** + * Function definition to inform the FIB node that its last lock has gone. + */ +static void +vxlan_gpe_ioam_last_lock_gone (fib_node_t * node) +{ + ASSERT (0); +} + + +/* + * Virtual function table registered by MPLS GRE tunnels + * for participation in the FIB object graph. 
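+ * A back walk lands in vxlan_gpe_ioam_back_walk, which re-programs the
+ * ip4-output transit feature on every tracked destination.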
+ */ +const static fib_node_vft_t vxlan_gpe_ioam_vft = { + .fnv_get = vxlan_gpe_ioam_fib_node_get, + .fnv_last_lock = vxlan_gpe_ioam_last_lock_gone, + .fnv_back_walk = vxlan_gpe_ioam_back_walk, +}; + +void +vxlan_gpe_ioam_interface_init (void) +{ + vxlan_gpe_ioam_main_t *hm = &vxlan_gpe_ioam_main; + hm->fib_entry_type = fib_node_register_new_type (&vxlan_gpe_ioam_vft); + return; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_ioam.h b/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_ioam.h new file mode 100644 index 00000000..0711b87a --- /dev/null +++ b/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_ioam.h @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_vxlan_gpe_ioam_h__ +#define __included_vxlan_gpe_ioam_h__ + +#include <vnet/vxlan-gpe/vxlan_gpe.h> +#include <vnet/vxlan-gpe/vxlan_gpe_packet.h> +#include <ioam/lib-vxlan-gpe/vxlan_gpe_ioam_packet.h> +#include <vnet/ip/ip.h> + + +typedef struct vxlan_gpe_sw_interface_ +{ + u32 sw_if_index; +} vxlan_gpe_ioam_sw_interface_t; + +typedef struct vxlan_gpe_dest_tunnels_ +{ + ip46_address_t dst_addr; + u32 fp_proto; + u32 sibling_index; + fib_node_index_t fib_entry_index; + u32 outer_fib_index; +} vxlan_gpe_ioam_dest_tunnels_t; + +typedef struct vxlan_gpe_ioam_main_ +{ + /** + * Linkage into the FIB object graph + */ + fib_node_t node; + + /* time scale transform. Joy. */ + u32 unix_time_0; + f64 vlib_time_0; + + + /* Trace option */ + u8 has_trace_option; + + /* Pot option */ + u8 has_pot_option; + +#define PPC_NONE 0 +#define PPC_ENCAP 1 +#define PPC_DECAP 2 + u8 has_ppc_option; + +#define TSP_SECONDS 0 +#define TSP_MILLISECONDS 1 +#define TSP_MICROSECONDS 2 +#define TSP_NANOSECONDS 3 + + /* Array of function pointers to ADD and POP VxLAN-GPE iOAM option handling routines */ + u8 options_size[256]; + int (*add_options[256]) (u8 * rewrite_string, u8 * rewrite_size); + int (*pop_options[256]) (ip4_header_t * ip, vxlan_gpe_ioam_option_t * opt); + + /* Array of function pointers to iOAM option handling routines */ + int (*options[256]) (vlib_buffer_t * b, vxlan_gpe_ioam_option_t * opt, + u8 is_ipv4, u8 use_adj); + u8 *(*trace[256]) (u8 * s, vxlan_gpe_ioam_option_t * opt); + + /* API message ID base */ + u16 msg_id_base; + + /* Override to export for iOAM */ + uword decap_v4_next_override; + uword decap_v6_next_override; + + /* sequence of node graph for encap */ + uword encap_v4_next_node; + uword encap_v6_next_node; + + /* Software interfaces. 
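+     One element per tracked sw_if_index.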
*/ + vxlan_gpe_ioam_sw_interface_t *sw_interfaces; + + /* hash ip4/ip6 -> list of destinations for doing transit iOAM operation */ + vxlan_gpe_ioam_dest_tunnels_t *dst_tunnels; + uword *dst_by_ip4; + uword *dst_by_ip6; + + /** per sw_if_index, to maintain bitmap */ + u8 *bool_ref_by_sw_if_index; + fib_node_type_t fib_entry_type; + + /** State convenience vlib_main_t */ + vlib_main_t *vlib_main; + /** State convenience vnet_main_t */ + vnet_main_t *vnet_main; + + +} vxlan_gpe_ioam_main_t; +extern vxlan_gpe_ioam_main_t vxlan_gpe_ioam_main; + +/* + * Primary h-b-h handler trace support + */ +typedef struct +{ + u32 next_index; + u32 trace_len; + u8 option_data[256]; +} ioam_trace_t; + + +extern vlib_node_registration_t vxlan_gpe_encap_ioam_v4_node; +extern vlib_node_registration_t vxlan_gpe_decap_ioam_v4_node; +extern vlib_node_registration_t vxlan_gpe_transit_ioam_v4_node; + +clib_error_t *vxlan_gpe_ioam_enable (int has_trace_option, int has_pot_option, + int has_ppc_option); + +clib_error_t *vxlan_gpe_ioam_disable (int has_trace_option, + int has_pot_option, int has_ppc_option); + +clib_error_t *vxlan_gpe_ioam_set (vxlan_gpe_tunnel_t * t, + int has_trace_option, + int has_pot_option, + int has_ppc_option, u8 ipv6_set); +clib_error_t *vxlan_gpe_ioam_clear (vxlan_gpe_tunnel_t * t, + int has_trace_option, int has_pot_option, + int has_ppc_option, u8 ipv6_set); + +int vxlan_gpe_ioam_add_register_option (u8 option, + u8 size, + int rewrite_options (u8 * + rewrite_string, + u8 * + rewrite_size)); + +int vxlan_gpe_add_unregister_option (u8 option); + +int vxlan_gpe_ioam_register_option (u8 option, + int options (vlib_buffer_t * b, + vxlan_gpe_ioam_option_t * + opt, u8 is_ipv4, u8 use_adj), + u8 * trace (u8 * s, + vxlan_gpe_ioam_option_t * + opt)); +int vxlan_gpe_ioam_unregister_option (u8 option); + +int vxlan_gpe_trace_profile_setup (void); + +int vxlan_gpe_trace_profile_cleanup (void); +extern void vxlan_gpe_ioam_interface_init (void); +int +vxlan_gpe_enable_disable_ioam_for_dest (vlib_main_t * vm, + ip46_address_t dst_addr, + u32 outer_fib_index, + u8 is_ipv4, u8 is_add); +int vxlan_gpe_ioam_disable_for_dest + (vlib_main_t * vm, ip46_address_t dst_addr, u32 outer_fib_index, + u8 ipv4_set); + +typedef enum +{ + VXLAN_GPE_DECAP_IOAM_V4_NEXT_POP, + VXLAN_GPE_DECAP_IOAM_V4_NEXT_DROP, + VXLAN_GPE_DECAP_IOAM_V4_N_NEXT +} vxlan_gpe_decap_ioam_v4_next_t; + +#endif + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_ioam_packet.h b/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_ioam_packet.h new file mode 100644 index 00000000..a7ef859e --- /dev/null +++ b/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_ioam_packet.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __included_vxlan_gpe_ioam_packet_h__ +#define __included_vxlan_gpe_ioam_packet_h__ + +#include <vnet/vxlan-gpe/vxlan_gpe.h> +#include <vnet/vxlan-gpe/vxlan_gpe_packet.h> +#include <vnet/ip/ip.h> + + + +#define VXLAN_GPE_OPTION_TYPE_IOAM_TRACE 59 +#define VXLAN_GPE_OPTION_TYPE_IOAM_PROOF_OF_TRANSIT 60 + +/** + * @brief VXLAN GPE Extension (iOAM) Header definition + */ +typedef struct +{ + u8 type; + u8 length; + /** Reserved */ + u8 reserved; + /** see vxlan_gpe_protocol_t */ + u8 protocol; +} vxlan_gpe_ioam_hdr_t; + +/* + * @brief VxLAN GPE iOAM Option definition + */ +typedef struct +{ + /* Option Type */ + u8 type; + /* Length in octets of the option data field */ + u8 length; +} vxlan_gpe_ioam_option_t; + + +#endif + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_ioam_trace.c b/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_ioam_trace.c new file mode 100644 index 00000000..f3d03b67 --- /dev/null +++ b/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_ioam_trace.c @@ -0,0 +1,551 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vppinfra/error.h> + +#include <vnet/vxlan-gpe/vxlan_gpe.h> +#include <vnet/vxlan-gpe/vxlan_gpe_packet.h> + +#include <vppinfra/hash.h> +#include <vppinfra/error.h> +#include <vppinfra/elog.h> + +#include <ioam/lib-trace/trace_util.h> +#include <ioam/lib-trace/trace_config.h> +#include <ioam/lib-vxlan-gpe/vxlan_gpe_ioam.h> + +/* Timestamp precision multipliers for seconds, milliseconds, microseconds + * and nanoseconds respectively. + */ +static f64 trace_tsp_mul[4] = { 1, 1e3, 1e6, 1e9 }; + +typedef union +{ + u64 as_u64; + u32 as_u32[2]; +} time_u64_t; + + +/* *INDENT-OFF* */ +typedef CLIB_PACKED(struct { + vxlan_gpe_ioam_option_t hdr; + u8 ioam_trace_type; + u8 data_list_elts_left; + u32 elts[0]; /* Variable type. 
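+     Each element is a whole number of 4-octet words; the exact width
+     comes from the trace-type bits in the active profile.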
So keep it generic */
+}) vxlan_gpe_ioam_trace_option_t;
+/* *INDENT-ON* */
+
+
+#define foreach_vxlan_gpe_ioam_trace_stats \
+  _(SUCCESS, "Pkts updated with TRACE records") \
+  _(FAILED, "Errors in TRACE due to lack of TRACE records")
+
+static char *vxlan_gpe_ioam_trace_stats_strings[] = {
+#define _(sym,string) string,
+  foreach_vxlan_gpe_ioam_trace_stats
+#undef _
+};
+
+typedef enum
+{
+#define _(sym,str) VXLAN_GPE_IOAM_TRACE_##sym,
+  foreach_vxlan_gpe_ioam_trace_stats
+#undef _
+    VXLAN_GPE_IOAM_TRACE_N_STATS,
+} vxlan_gpe_ioam_trace_stats_t;
+
+
+typedef struct
+{
+  /* stats */
+  u64 counters[ARRAY_LEN (vxlan_gpe_ioam_trace_stats_strings)];
+
+  /* convenience */
+  vlib_main_t *vlib_main;
+  vnet_main_t *vnet_main;
+} vxlan_gpe_ioam_trace_main_t;
+
+vxlan_gpe_ioam_trace_main_t vxlan_gpe_ioam_trace_main;
+
+int
+vxlan_gpe_ioam_add_register_option (u8 option,
+                                    u8 size,
+                                    int rewrite_options (u8 * rewrite_string,
+                                                         u8 * rewrite_size))
+{
+  vxlan_gpe_ioam_main_t *hm = &vxlan_gpe_ioam_main;
+
+  ASSERT (option < ARRAY_LEN (hm->add_options));
+
+  /* Already registered */
+  if (hm->add_options[option])
+    return (-1);
+
+  hm->add_options[option] = rewrite_options;
+  hm->options_size[option] = size;
+
+  return (0);
+}
+
+int
+vxlan_gpe_add_unregister_option (u8 option)
+{
+  vxlan_gpe_ioam_main_t *hm = &vxlan_gpe_ioam_main;
+
+  ASSERT (option < ARRAY_LEN (hm->add_options));
+
+  /* Not registered */
+  if (!hm->add_options[option])
+    return (-1);
+
+  hm->add_options[option] = NULL;
+  hm->options_size[option] = 0;
+  return (0);
+}
+
+
+int
+vxlan_gpe_ioam_register_option (u8 option,
+                                int options (vlib_buffer_t * b,
+                                             vxlan_gpe_ioam_option_t * opt,
+                                             u8 is_ipv4, u8 use_adj),
+                                u8 * trace (u8 * s,
+                                            vxlan_gpe_ioam_option_t * opt))
+{
+  vxlan_gpe_ioam_main_t *im = &vxlan_gpe_ioam_main;
+
+  ASSERT (option < ARRAY_LEN (im->options));
+
+  /* Already registered */
+  if (im->options[option])
+    return (-1);
+
+  im->options[option] = options;
+  im->trace[option] = trace;
+
+  return (0);
+}
+
+int
+vxlan_gpe_ioam_unregister_option (u8 option)
+{
+  vxlan_gpe_ioam_main_t *hm = &vxlan_gpe_ioam_main;
+
+  ASSERT (option < ARRAY_LEN (hm->options));
+
+  /* Not registered */
+  if (!hm->options[option])
+    return (-1);
+
+  hm->options[option] = NULL;
+  hm->trace[option] = NULL;
+
+  return (0);
+}
+
+
+always_inline void
+vxlan_gpe_ioam_trace_stats_increment_counter (u32 counter_index,
+                                              u64 increment)
+{
+  vxlan_gpe_ioam_trace_main_t *hm = &vxlan_gpe_ioam_trace_main;
+
+  hm->counters[counter_index] += increment;
+}
+
+
+static u8 *
+format_ioam_data_list_element (u8 * s, va_list * args)
+{
+  u32 *elt = va_arg (*args, u32 *);
+  u8 *trace_type_p = va_arg (*args, u8 *);
+  u8 trace_type = *trace_type_p;
+
+
+  if (trace_type & BIT_TTL_NODEID)
+    {
+      u32 ttl_node_id_host_byte_order = clib_net_to_host_u32 (*elt);
+      s = format (s, "ttl 0x%x node id 0x%x ",
+                  ttl_node_id_host_byte_order >> 24,
+                  ttl_node_id_host_byte_order & 0x00FFFFFF);
+
+      elt++;
+    }
+
+  if (trace_type & BIT_ING_INTERFACE)
+    {
+      u32 ingress_host_byte_order = clib_net_to_host_u32 (*elt);
+      s = format (s, "ingress 0x%x egress 0x%x ",
+                  ingress_host_byte_order >> 16,
+                  ingress_host_byte_order & 0xFFFF);
+      elt++;
+    }
+
+  if (trace_type & BIT_TIMESTAMP)
+    {
+      u32 ts_in_host_byte_order = clib_net_to_host_u32 (*elt);
+      s = format (s, "ts 0x%x \n", ts_in_host_byte_order);
+      elt++;
+    }
+
+  if (trace_type & BIT_APPDATA)
+    {
+      u32 appdata_in_host_byte_order = clib_net_to_host_u32 (*elt);
+      s = format (s, "app 0x%x ", appdata_in_host_byte_order);
+      elt++;
+    }
+
+  return s;
+}
+
+
+
+int
+vxlan_gpe_ioam_trace_rewrite_handler (u8 * rewrite_string, u8 * rewrite_size)
+{
+  vxlan_gpe_ioam_trace_option_t *trace_option = NULL;
+  u8 trace_data_size = 0;
+  u8 trace_option_elts = 0;
+  trace_profile *profile = NULL;
+
+
+  profile = trace_profile_find ();
+
+  if (PREDICT_FALSE (!profile))
+    {
+      return (-1);
+    }
+
+  if (PREDICT_FALSE (!rewrite_string))
+    return -1;
+
+  trace_option_elts = profile->num_elts;
+  trace_data_size = fetch_trace_data_size (profile->trace_type);
+  trace_option = (vxlan_gpe_ioam_trace_option_t *) rewrite_string;
+  trace_option->hdr.type = VXLAN_GPE_OPTION_TYPE_IOAM_TRACE;
+  trace_option->hdr.length = 2 /* ioam_trace_type, data_list_elts_left */  +
+    trace_option_elts * trace_data_size;
+  trace_option->ioam_trace_type = profile->trace_type & TRACE_TYPE_MASK;
+  trace_option->data_list_elts_left = trace_option_elts;
+  *rewrite_size =
+    sizeof (vxlan_gpe_ioam_trace_option_t) +
+    (trace_option_elts * trace_data_size);
+
+  return 0;
+}
+
+
+int
+vxlan_gpe_ioam_trace_data_list_handler (vlib_buffer_t * b,
+                                        vxlan_gpe_ioam_option_t * opt,
+                                        u8 is_ipv4, u8 use_adj)
+{
+  u8 elt_index = 0;
+  vxlan_gpe_ioam_trace_option_t *trace =
+    (vxlan_gpe_ioam_trace_option_t *) opt;
+  time_u64_t time_u64;
+  u32 *elt;
+  int rv = 0;
+  trace_profile *profile = NULL;
+  vxlan_gpe_ioam_main_t *hm = &vxlan_gpe_ioam_main;
+
+
+  profile = trace_profile_find ();
+
+  if (PREDICT_FALSE (!profile))
+    {
+      return (-1);
+    }
+
+
+  time_u64.as_u64 = 0;
+
+  if (PREDICT_TRUE (trace->data_list_elts_left))
+    {
+      trace->data_list_elts_left--;
+      /* fetch_trace_data_size returns the record size in bytes; convert
+       * it to 32-bit words to index this node's slot in elts[].
+       */
+      elt_index =
+        trace->data_list_elts_left *
+        fetch_trace_data_size (trace->ioam_trace_type) / 4;
+      elt = &trace->elts[elt_index];
+      if (is_ipv4)
+        {
+          if (trace->ioam_trace_type & BIT_TTL_NODEID)
+            {
+              ip4_header_t *ip0 = vlib_buffer_get_current (b);
+              /* The transit case is the only one where the TTL decrement
+               * happens before iOAM processing, so for now the use_adj flag
+               * is overloaded to signal it; a separate flag would be cleaner.
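+               * With that overload, (ip0->ttl - 1 + use_adj) records the
+               * post-forwarding TTL in both cases: encap/decap (use_adj == 0)
+               * stores ttl - 1 ahead of the decrement, while transit
+               * (use_adj == 1) stores the already-decremented ttl unchanged.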
+ */ + *elt = clib_host_to_net_u32 (((ip0->ttl - 1 + use_adj) << 24) | + profile->node_id); + elt++; + } + + if (trace->ioam_trace_type & BIT_ING_INTERFACE) + { + u16 tx_if = 0; + u32 adj_index = vnet_buffer (b)->ip.adj_index[VLIB_TX]; + + if (use_adj) + { + ip_adjacency_t *adj = adj_get (adj_index); + tx_if = adj->rewrite_header.sw_if_index & 0xFFFF; + } + + *elt = + (vnet_buffer (b)->sw_if_index[VLIB_RX] & 0xFFFF) << 16 | + tx_if; + *elt = clib_host_to_net_u32 (*elt); + elt++; + } + } + else + { + if (trace->ioam_trace_type & BIT_TTL_NODEID) + { + ip6_header_t *ip0 = vlib_buffer_get_current (b); + *elt = clib_host_to_net_u32 ((ip0->hop_limit << 24) | + profile->node_id); + elt++; + } + if (trace->ioam_trace_type & BIT_ING_INTERFACE) + { + u16 tx_if = 0; + u32 adj_index = vnet_buffer (b)->ip.adj_index[VLIB_TX]; + + if (use_adj) + { + ip_adjacency_t *adj = adj_get (adj_index); + tx_if = adj->rewrite_header.sw_if_index & 0xFFFF; + } + + *elt = + (vnet_buffer (b)->sw_if_index[VLIB_RX] & 0xFFFF) << 16 | + tx_if; + *elt = clib_host_to_net_u32 (*elt); + elt++; + } + } + + if (trace->ioam_trace_type & BIT_TIMESTAMP) + { + /* Send least significant 32 bits */ + f64 time_f64 = + (f64) (((f64) hm->unix_time_0) + + (vlib_time_now (hm->vlib_main) - hm->vlib_time_0)); + + time_u64.as_u64 = time_f64 * trace_tsp_mul[profile->trace_tsp]; + *elt = clib_host_to_net_u32 (time_u64.as_u32[0]); + elt++; + } + + if (trace->ioam_trace_type & BIT_APPDATA) + { + /* $$$ set elt0->app_data */ + *elt = clib_host_to_net_u32 (profile->app_data); + elt++; + } + vxlan_gpe_ioam_trace_stats_increment_counter + (VXLAN_GPE_IOAM_TRACE_SUCCESS, 1); + } + else + { + vxlan_gpe_ioam_trace_stats_increment_counter + (VXLAN_GPE_IOAM_TRACE_FAILED, 1); + } + return (rv); +} + +u8 * +vxlan_gpe_ioam_trace_data_list_trace_handler (u8 * s, + vxlan_gpe_ioam_option_t * opt) +{ + vxlan_gpe_ioam_trace_option_t *trace; + u8 trace_data_size_in_words = 0; + u32 *elt; + int elt_index = 0; + + trace = (vxlan_gpe_ioam_trace_option_t *) opt; + s = + format (s, " Trace Type 0x%x , %d elts left\n", trace->ioam_trace_type, + trace->data_list_elts_left); + trace_data_size_in_words = + fetch_trace_data_size (trace->ioam_trace_type) / 4; + elt = &trace->elts[0]; + while ((u8 *) elt < ((u8 *) (&trace->elts[0]) + trace->hdr.length - 2 + /* -2 accounts for ioam_trace_type,elts_left */ )) + { + s = format (s, " [%d] %U\n", elt_index, + format_ioam_data_list_element, + elt, &trace->ioam_trace_type); + elt_index++; + elt += trace_data_size_in_words; + } + return (s); +} + + +static clib_error_t * +vxlan_gpe_show_ioam_trace_cmd_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vxlan_gpe_ioam_trace_main_t *hm = &vxlan_gpe_ioam_trace_main; + u8 *s = 0; + int i = 0; + + for (i = 0; i < VXLAN_GPE_IOAM_TRACE_N_STATS; i++) + { + s = format (s, " %s - %lu\n", vxlan_gpe_ioam_trace_stats_strings[i], + hm->counters[i]); + } + + vlib_cli_output (vm, "%v", s); + vec_free (s); + return 0; +} + + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (vxlan_gpe_show_ioam_trace_cmd, static) = { + .path = "show ioam vxlan-gpe trace", + .short_help = "iOAM trace statistics", + .function = vxlan_gpe_show_ioam_trace_cmd_fn, +}; +/* *INDENT-ON* */ + + +static clib_error_t * +vxlan_gpe_ioam_trace_init (vlib_main_t * vm) +{ + vxlan_gpe_ioam_trace_main_t *hm = &vxlan_gpe_ioam_trace_main; + clib_error_t *error; + + if ((error = vlib_call_init_function (vm, ip_main_init))) + return (error); + + if ((error = vlib_call_init_function (vm, ip6_lookup_init))) + return error; 
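+
+  /*
+   * The calls below hook the iOAM trace option into the VXLAN-GPE
+   * encap/decap and transit paths.  Any other option type could be
+   * registered the same way (sketch; MY_OPTION_TYPE and the my_*
+   * handlers are hypothetical):
+   *
+   *   vxlan_gpe_ioam_register_option (MY_OPTION_TYPE,
+   *                                   my_data_handler, my_format_fn);
+   *   vxlan_gpe_ioam_add_register_option (MY_OPTION_TYPE,
+   *                                       sizeof (my_option_t),
+   *                                       my_rewrite_fn);
+   */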
+ + if ((error = vlib_call_init_function (vm, vxlan_gpe_init))) + return (error); + + hm->vlib_main = vm; + hm->vnet_main = vnet_get_main (); + memset (hm->counters, 0, sizeof (hm->counters)); + + if (vxlan_gpe_ioam_register_option + (VXLAN_GPE_OPTION_TYPE_IOAM_TRACE, + vxlan_gpe_ioam_trace_data_list_handler, + vxlan_gpe_ioam_trace_data_list_trace_handler) < 0) + return (clib_error_create + ("registration of VXLAN_GPE_OPTION_TYPE_IOAM_TRACE failed")); + + + if (vxlan_gpe_ioam_add_register_option + (VXLAN_GPE_OPTION_TYPE_IOAM_TRACE, + sizeof (vxlan_gpe_ioam_trace_option_t), + vxlan_gpe_ioam_trace_rewrite_handler) < 0) + return (clib_error_create + ("registration of VXLAN_GPE_OPTION_TYPE_IOAM_TRACE for rewrite failed")); + + + return (0); +} + +VLIB_INIT_FUNCTION (vxlan_gpe_ioam_trace_init); + +int +vxlan_gpe_trace_profile_cleanup (void) +{ + vxlan_gpe_ioam_main_t *hm = &vxlan_gpe_ioam_main; + + hm->options_size[VXLAN_GPE_OPTION_TYPE_IOAM_TRACE] = 0; + + return 0; + +} + +static int +vxlan_gpe_ioam_trace_get_sizeof_handler (u32 * result) +{ + u16 size = 0; + u8 trace_data_size = 0; + trace_profile *profile = NULL; + + *result = 0; + + profile = trace_profile_find (); + + if (PREDICT_FALSE (!profile)) + { + return (-1); + } + + trace_data_size = fetch_trace_data_size (profile->trace_type); + if (PREDICT_FALSE (trace_data_size == 0)) + return VNET_API_ERROR_INVALID_VALUE; + + if (PREDICT_FALSE (profile->num_elts * trace_data_size > 254)) + return VNET_API_ERROR_INVALID_VALUE; + + size += + sizeof (vxlan_gpe_ioam_trace_option_t) + + profile->num_elts * trace_data_size; + *result = size; + + return 0; +} + + +int +vxlan_gpe_trace_profile_setup (void) +{ + u32 trace_size = 0; + vxlan_gpe_ioam_main_t *hm = &vxlan_gpe_ioam_main; + + trace_profile *profile = NULL; + + + profile = trace_profile_find (); + + if (PREDICT_FALSE (!profile)) + { + return (-1); + } + + + if (vxlan_gpe_ioam_trace_get_sizeof_handler (&trace_size) < 0) + return (-1); + + hm->options_size[VXLAN_GPE_OPTION_TYPE_IOAM_TRACE] = trace_size; + + return (0); +} + + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_ioam_util.h b/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_ioam_util.h new file mode 100644 index 00000000..c0ad8d9d --- /dev/null +++ b/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_ioam_util.h @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#ifndef __included_vxlan_gpe_ioam_util_h__
+#define __included_vxlan_gpe_ioam_util_h__
+
+#include <vnet/vxlan-gpe/vxlan_gpe.h>
+#include <vnet/vxlan-gpe/vxlan_gpe_packet.h>
+#include <vnet/ip/ip.h>
+
+
+typedef struct
+{
+  u32 tunnel_index;
+  ioam_trace_t fmt_trace;
+} vxlan_gpe_ioam_v4_trace_t;
+
+
+static u8 *
+format_vxlan_gpe_ioam_v4_trace (u8 * s, va_list * args)
+{
+  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+  vxlan_gpe_ioam_v4_trace_t *t1 = va_arg (*args, vxlan_gpe_ioam_v4_trace_t *);
+  ioam_trace_t *t = &(t1->fmt_trace);
+  vxlan_gpe_ioam_option_t *fmt_trace0;
+  vxlan_gpe_ioam_option_t *opt0, *limit0;
+  vxlan_gpe_ioam_main_t *hm = &vxlan_gpe_ioam_main;
+
+  u8 type0;
+
+  fmt_trace0 = (vxlan_gpe_ioam_option_t *) t->option_data;
+
+  s = format (s, "VXLAN-GPE-IOAM: next_index %d len %d traced %d",
+              t->next_index, fmt_trace0->length, t->trace_len);
+
+  opt0 = (vxlan_gpe_ioam_option_t *) (fmt_trace0 + 1);
+  limit0 = (vxlan_gpe_ioam_option_t *) (((u8 *) fmt_trace0) + t->trace_len);
+
+  while (opt0 < limit0)
+    {
+      type0 = opt0->type;
+      switch (type0)
+        {
+        case 0:                /* Pad1 */
+          opt0 = (vxlan_gpe_ioam_option_t *) (((u8 *) opt0) + 1);
+          break;
+
+        default:
+          if (hm->trace[type0])
+            {
+              s = (*hm->trace[type0]) (s, opt0);
+            }
+          else
+            {
+              s =
+                format (s, "\n unrecognized option %d length %d", type0,
+                        opt0->length);
+            }
+          opt0 =
+            (vxlan_gpe_ioam_option_t *) (((u8 *) opt0) + opt0->length +
+                                         sizeof (vxlan_gpe_ioam_option_t));
+          break;
+        }
+    }
+
+  s = format (s, "VXLAN-GPE-IOAM: tunnel %d", t1->tunnel_index);
+  return s;
+}
+
+
+always_inline void
+vxlan_gpe_encap_decap_ioam_v4_one_inline (vlib_main_t * vm,
+                                          vlib_node_runtime_t * node,
+                                          vlib_buffer_t * b0,
+                                          u32 * next0, u32 drop_node_val,
+                                          u8 use_adj)
+{
+  ip4_header_t *ip0;
+  udp_header_t *udp_hdr0;
+  vxlan_gpe_header_t *gpe_hdr0;
+  vxlan_gpe_ioam_hdr_t *gpe_ioam0;
+  vxlan_gpe_ioam_option_t *opt0;
+  vxlan_gpe_ioam_option_t *limit0;
+  vxlan_gpe_ioam_main_t *hm = &vxlan_gpe_ioam_main;
+
+  /* Populate the iOAM header */
+  ip0 = vlib_buffer_get_current (b0);
+  udp_hdr0 = (udp_header_t *) (ip0 + 1);
+  gpe_hdr0 = (vxlan_gpe_header_t *) (udp_hdr0 + 1);
+  gpe_ioam0 = (vxlan_gpe_ioam_hdr_t *) (gpe_hdr0 + 1);
+  opt0 = (vxlan_gpe_ioam_option_t *) (gpe_ioam0 + 1);
+  limit0 = (vxlan_gpe_ioam_option_t *) ((u8 *) gpe_ioam0 + gpe_ioam0->length);
+
+  /*
+   * Basic validity checks
+   */
+  if (gpe_ioam0->length > clib_net_to_host_u16 (ip0->length))
+    {
+      *next0 = drop_node_val;
+      return;
+    }
+
+  /* Scan the set of h-b-h options, process ones that we understand */
+  while (opt0 < limit0)
+    {
+      u8 type0;
+      type0 = opt0->type;
+      switch (type0)
+        {
+        case 0:                /* Pad1 */
+          opt0 = (vxlan_gpe_ioam_option_t *) (((u8 *) opt0) + 1);
+          continue;
+        case 1:                /* PadN */
+          break;
+        default:
+          if (hm->options[type0])
+            {
+              if ((*hm->options[type0]) (b0, opt0, 1 /* is_ipv4 */ ,
+                                         use_adj) < 0)
+                {
+                  *next0 = drop_node_val;
+                  return;
+                }
+            }
+          break;
+        }
+      opt0 =
+        (vxlan_gpe_ioam_option_t *) (((u8 *) opt0) + opt0->length +
+                                     sizeof (vxlan_gpe_ioam_option_t));
+    }
+
+
+  if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
+    {
+      vxlan_gpe_ioam_v4_trace_t *t =
+        vlib_add_trace (vm, node, b0, sizeof (*t));
+      u32 trace_len = gpe_ioam0->length;
+      t->fmt_trace.next_index = *next0;
+      /* Capture the ioam option verbatim */
+      trace_len =
+        trace_len <
+        ARRAY_LEN (t->fmt_trace.
+                   option_data) ? trace_len : ARRAY_LEN (t->fmt_trace.
+ option_data); + t->fmt_trace.trace_len = trace_len; + clib_memcpy (&(t->fmt_trace.option_data), gpe_ioam0, trace_len); + } + return; +} + + +#endif + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_msg_enum.h b/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_msg_enum.h new file mode 100644 index 00000000..cc0a10a3 --- /dev/null +++ b/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_msg_enum.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef included_vxlan_gpe_msg_enum_h +#define included_vxlan_gpe_msg_enum_h + +#include <vppinfra/byte_order.h> + +#define vl_msg_id(n,h) n, +typedef enum { +#include <ioam/lib-vxlan-gpe/vxlan_gpe_all_api_h.h> + /* We'll want to know how many messages IDs we need... */ + VL_MSG_FIRST_AVAILABLE, +} vl_msg_id_t; +#undef vl_msg_id + +#endif /* included_vxlan_gpe_msg_enum_h */ diff --git a/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_test.c b/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_test.c new file mode 100644 index 00000000..80e65644 --- /dev/null +++ b/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_test.c @@ -0,0 +1,548 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + *------------------------------------------------------------------ + * vxlan_gpe_test.c - test harness for vxlan_gpe plugin + *------------------------------------------------------------------ + */ + +#include <vat/vat.h> +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vlibsocket/api.h> +#include <vppinfra/error.h> + +#define __plugin_msg_base vxlan_gpe_test_main.msg_id_base +#include <vlibapi/vat_helper_macros.h> + +/* Declare message IDs */ +#include <ioam/lib-vxlan-gpe/vxlan_gpe_msg_enum.h> + +/* define message structures */ +#define vl_typedefs +#include <ioam/lib-vxlan-gpe/vxlan_gpe_all_api_h.h> +#undef vl_typedefs + +/* declare message handlers for each api */ + +#define vl_endianfun /* define message structures */ +#include <ioam/lib-vxlan-gpe/vxlan_gpe_all_api_h.h> +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) +#define vl_printfun +#include <ioam/lib-vxlan-gpe/vxlan_gpe_all_api_h.h> +#undef vl_printfun + +/* Get the API version number. 
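It is baked into the plugin name registered in vat_plugin_register below, so the client binds to the message-ID base of this exact API revision.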
*/ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <ioam/lib-vxlan-gpe/vxlan_gpe_all_api_h.h> +#undef vl_api_version +#include <ioam/lib-vxlan-gpe/vxlan_gpe_ioam_packet.h> +#include <ioam/lib-vxlan-gpe/vxlan_gpe_ioam.h> + +typedef struct +{ + /* API message ID base */ + u16 msg_id_base; + vat_main_t *vat_main; +} vxlan_gpe_test_main_t; + +vxlan_gpe_test_main_t vxlan_gpe_test_main; + +#define foreach_standard_reply_retval_handler \ +_(vxlan_gpe_ioam_enable_reply) \ +_(vxlan_gpe_ioam_disable_reply) \ +_(vxlan_gpe_ioam_vni_enable_reply) \ +_(vxlan_gpe_ioam_vni_disable_reply) \ +_(vxlan_gpe_ioam_transit_enable_reply) \ +_(vxlan_gpe_ioam_transit_disable_reply) + +#define _(n) \ + static void vl_api_##n##_t_handler \ + (vl_api_##n##_t * mp) \ + { \ + vat_main_t * vam = vxlan_gpe_test_main.vat_main; \ + i32 retval = ntohl(mp->retval); \ + if (vam->async_mode) { \ + vam->async_errors += (retval < 0); \ + } else { \ + vam->retval = retval; \ + vam->result_ready = 1; \ + } \ + } +foreach_standard_reply_retval_handler; +#undef _ + +/* + * Table of message reply handlers, must include boilerplate handlers + * we just generated + */ +#define foreach_vpe_api_reply_msg \ +_(VXLAN_GPE_IOAM_ENABLE_REPLY, vxlan_gpe_ioam_enable_reply) \ +_(VXLAN_GPE_IOAM_DISABLE_REPLY, vxlan_gpe_ioam_disable_reply) \ +_(VXLAN_GPE_IOAM_VNI_ENABLE_REPLY, vxlan_gpe_ioam_vni_enable_reply) \ +_(VXLAN_GPE_IOAM_VNI_DISABLE_REPLY, vxlan_gpe_ioam_vni_disable_reply) \ +_(VXLAN_GPE_IOAM_TRANSIT_ENABLE_REPLY, vxlan_gpe_ioam_transit_enable_reply) \ +_(VXLAN_GPE_IOAM_TRANSIT_DISABLE_REPLY, vxlan_gpe_ioam_transit_disable_reply) \ + +static int +api_vxlan_gpe_ioam_enable (vat_main_t * vam) +{ + unformat_input_t *input = vam->input; + vl_api_vxlan_gpe_ioam_enable_t *mp; + u32 id = 0; + int has_trace_option = 0; + int has_pow_option = 0; + int has_ppc_option = 0; + int ret; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "trace")) + has_trace_option = 1; + else if (unformat (input, "pow")) + has_pow_option = 1; + else if (unformat (input, "ppc encap")) + has_ppc_option = PPC_ENCAP; + else if (unformat (input, "ppc decap")) + has_ppc_option = PPC_DECAP; + else if (unformat (input, "ppc none")) + has_ppc_option = PPC_NONE; + else + break; + } + M (VXLAN_GPE_IOAM_ENABLE, mp); + mp->id = htons (id); + mp->trace_ppc = has_ppc_option; + mp->pow_enable = has_pow_option; + mp->trace_enable = has_trace_option; + + + S (mp); + W (ret); + return ret; +} + + +static int +api_vxlan_gpe_ioam_disable (vat_main_t * vam) +{ + vl_api_vxlan_gpe_ioam_disable_t *mp; + int ret; + + M (VXLAN_GPE_IOAM_DISABLE, mp); + S (mp); + W (ret); + return ret; +} + +static int +api_vxlan_gpe_ioam_vni_enable (vat_main_t * vam) +{ + unformat_input_t *line_input = vam->input; + vl_api_vxlan_gpe_ioam_vni_enable_t *mp; + ip4_address_t local4, remote4; + ip6_address_t local6, remote6; + u8 ipv4_set = 0, ipv6_set = 0; + u8 local_set = 0; + u8 remote_set = 0; + u32 vni; + u8 vni_set = 0; + int ret; + + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "local %U", unformat_ip4_address, &local4)) + { + local_set = 1; + ipv4_set = 1; + } + else if (unformat (line_input, "remote %U", + unformat_ip4_address, &remote4)) + { + remote_set = 1; + ipv4_set = 1; + } + else if (unformat (line_input, "local %U", + unformat_ip6_address, &local6)) + { + local_set = 1; + ipv6_set = 1; + } + else if (unformat (line_input, "remote %U", + unformat_ip6_address, &remote6)) + { + remote_set = 
1; + ipv6_set = 1; + } + + else if (unformat (line_input, "vni %d", &vni)) + vni_set = 1; + else + { + errmsg ("parse error '%U'\n", format_unformat_error, line_input); + return -99; + } + } + + if (local_set == 0) + { + errmsg ("tunnel local address not specified\n"); + return -99; + } + if (remote_set == 0) + { + errmsg ("tunnel remote address not specified\n"); + return -99; + } + if (ipv4_set && ipv6_set) + { + errmsg ("both IPv4 and IPv6 addresses specified"); + return -99; + } + + if (vni_set == 0) + { + errmsg ("vni not specified\n"); + return -99; + } + + M (VXLAN_GPE_IOAM_VNI_ENABLE, mp); + + + if (ipv6_set) + { + clib_memcpy (&mp->local, &local6, sizeof (local6)); + clib_memcpy (&mp->remote, &remote6, sizeof (remote6)); + } + else + { + clib_memcpy (&mp->local, &local4, sizeof (local4)); + clib_memcpy (&mp->remote, &remote4, sizeof (remote4)); + } + + mp->vni = ntohl (vni); + mp->is_ipv6 = ipv6_set; + + S (mp); + W (ret); + return ret; +} + +static int +api_vxlan_gpe_ioam_vni_disable (vat_main_t * vam) +{ + unformat_input_t *line_input = vam->input; + vl_api_vxlan_gpe_ioam_vni_disable_t *mp; + ip4_address_t local4, remote4; + ip6_address_t local6, remote6; + u8 ipv4_set = 0, ipv6_set = 0; + u8 local_set = 0; + u8 remote_set = 0; + u32 vni; + u8 vni_set = 0; + int ret; + + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "local %U", unformat_ip4_address, &local4)) + { + local_set = 1; + ipv4_set = 1; + } + else if (unformat (line_input, "remote %U", + unformat_ip4_address, &remote4)) + { + remote_set = 1; + ipv4_set = 1; + } + else if (unformat (line_input, "local %U", + unformat_ip6_address, &local6)) + { + local_set = 1; + ipv6_set = 1; + } + else if (unformat (line_input, "remote %U", + unformat_ip6_address, &remote6)) + { + remote_set = 1; + ipv6_set = 1; + } + + else if (unformat (line_input, "vni %d", &vni)) + vni_set = 1; + else + { + errmsg ("parse error '%U'\n", format_unformat_error, line_input); + return -99; + } + } + + if (local_set == 0) + { + errmsg ("tunnel local address not specified\n"); + return -99; + } + if (remote_set == 0) + { + errmsg ("tunnel remote address not specified\n"); + return -99; + } + if (ipv4_set && ipv6_set) + { + errmsg ("both IPv4 and IPv6 addresses specified"); + return -99; + } + + if (vni_set == 0) + { + errmsg ("vni not specified\n"); + return -99; + } + + M (VXLAN_GPE_IOAM_VNI_DISABLE, mp); + + + if (ipv6_set) + { + clib_memcpy (&mp->local, &local6, sizeof (local6)); + clib_memcpy (&mp->remote, &remote6, sizeof (remote6)); + } + else + { + clib_memcpy (&mp->local, &local4, sizeof (local4)); + clib_memcpy (&mp->remote, &remote4, sizeof (remote4)); + } + + mp->vni = ntohl (vni); + mp->is_ipv6 = ipv6_set; + + S (mp); + W (ret); + return ret; +} + +static int +api_vxlan_gpe_ioam_transit_enable (vat_main_t * vam) +{ + unformat_input_t *line_input = vam->input; + vl_api_vxlan_gpe_ioam_transit_enable_t *mp; + ip4_address_t local4; + ip6_address_t local6; + u8 ipv4_set = 0, ipv6_set = 0; + u8 local_set = 0; + u32 outer_fib_index = 0; + int ret; + + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "dst-ip %U", unformat_ip4_address, &local4)) + { + local_set = 1; + ipv4_set = 1; + } + else if (unformat (line_input, "dst-ip %U", + unformat_ip6_address, &local6)) + { + local_set = 1; + ipv6_set = 1; + } + + else if (unformat (line_input, "outer-fib-index %d", &outer_fib_index)) + ; + else + { + errmsg ("parse error '%U'\n", format_unformat_error, 
line_input); + return -99; + } + } + + if (local_set == 0) + { + errmsg ("destination address not specified\n"); + return -99; + } + if (ipv4_set && ipv6_set) + { + errmsg ("both IPv4 and IPv6 addresses specified"); + return -99; + } + + + M (VXLAN_GPE_IOAM_TRANSIT_ENABLE, mp); + + + if (ipv6_set) + { + errmsg ("IPv6 currently unsupported"); + return -1; + } + else + { + clib_memcpy (&mp->dst_addr, &local4, sizeof (local4)); + } + + mp->outer_fib_index = htonl (outer_fib_index); + mp->is_ipv6 = ipv6_set; + + S (mp); + W (ret); + return ret; +} + +static int +api_vxlan_gpe_ioam_transit_disable (vat_main_t * vam) +{ + unformat_input_t *line_input = vam->input; + vl_api_vxlan_gpe_ioam_transit_disable_t *mp; + ip4_address_t local4; + ip6_address_t local6; + u8 ipv4_set = 0, ipv6_set = 0; + u8 local_set = 0; + u32 outer_fib_index = 0; + int ret; + + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "dst-ip %U", unformat_ip4_address, &local4)) + { + local_set = 1; + ipv4_set = 1; + } + else if (unformat (line_input, "dst-ip %U", + unformat_ip6_address, &local6)) + { + local_set = 1; + ipv6_set = 1; + } + + else if (unformat (line_input, "outer-fib-index %d", &outer_fib_index)) + ; + else + { + errmsg ("parse error '%U'\n", format_unformat_error, line_input); + return -99; + } + } + + if (local_set == 0) + { + errmsg ("destination address not specified\n"); + return -99; + } + if (ipv4_set && ipv6_set) + { + errmsg ("both IPv4 and IPv6 addresses specified"); + return -99; + } + + + M (VXLAN_GPE_IOAM_TRANSIT_DISABLE, mp); + + + if (ipv6_set) + { + return -1; + } + else + { + clib_memcpy (&mp->dst_addr, &local4, sizeof (local4)); + } + + mp->outer_fib_index = htonl (outer_fib_index); + mp->is_ipv6 = ipv6_set; + + S (mp); + W (ret); + return ret; +} + +/* + * List of messages that the api test plugin sends, + * and that the data plane plugin processes + */ +#define foreach_vpe_api_msg \ +_(vxlan_gpe_ioam_enable, ""\ + "[trace] [pow] [ppc <encap|ppc decap>]") \ +_(vxlan_gpe_ioam_disable, "") \ +_(vxlan_gpe_ioam_vni_enable, ""\ + "local <local_vtep_ip> remote <remote_vtep_ip> vni <vnid>") \ +_(vxlan_gpe_ioam_vni_disable, ""\ + "local <local_vtep_ip> remote <remote_vtep_ip> vni <vnid>") \ +_(vxlan_gpe_ioam_transit_enable, ""\ + "dst-ip <dst_ip> [outer-fib-index <outer_fib_index>]") \ +_(vxlan_gpe_ioam_transit_disable, ""\ + "dst-ip <dst_ip> [outer-fib-index <outer_fib_index>]") \ + + +static void +vxlan_gpe_vat_api_hookup (vat_main_t * vam) +{ + vxlan_gpe_test_main_t *sm = &vxlan_gpe_test_main; + /* Hook up handlers for replies from the data plane plug-in */ +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + sm->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_vpe_api_reply_msg; +#undef _ + + /* API messages we can send */ +#define _(n,h) hash_set_mem (vam->function_by_name, #n, api_##n); + foreach_vpe_api_msg; +#undef _ + + /* Help strings */ +#define _(n,h) hash_set_mem (vam->help_by_name, #n, h); + foreach_vpe_api_msg; +#undef _ +} + +clib_error_t * +vat_plugin_register (vat_main_t * vam) +{ + vxlan_gpe_test_main_t *sm = &vxlan_gpe_test_main; + u8 *name; + + sm->vat_main = vam; + + name = format (0, "ioam_vxlan_gpe_%08x%c", api_version, 0); + sm->msg_id_base = vl_client_get_first_plugin_msg_id ((char *) name); + + if (sm->msg_id_base != (u16) ~ 0) + vxlan_gpe_vat_api_hookup (vam); + + vec_free (name); + + return 0; +} + +/* + * 
fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/ioam/udp-ping/udp_ping.api b/src/plugins/ioam/udp-ping/udp_ping.api
new file mode 100644
index 00000000..87945816
--- /dev/null
+++ b/src/plugins/ioam/udp-ping/udp_ping.api
@@ -0,0 +1,73 @@
+/* Hey Emacs use -*- mode: C -*- */
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/** \brief UDP-Probe Add/Delete request
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param src_ip_address - Source ipv4/v6 address for the udp-ping flow
+    @param dst_ip_address - Destination ipv4/v6 address for the udp-ping flow
+    @param start_src_port - Starting source port of port range for udp-ping
+    @param end_src_port - End source port of port range for udp-ping
+    @param start_dst_port - Starting destination port of port range for udp-ping
+    @param end_dst_port - End destination port of port range for udp-ping
+    @param interval - Time interval in seconds at which udp probes need to be sent
+    @param is_ipv4 - TRUE if IPv4 addresses are used, FALSE for IPv6
+    @param dis - TRUE if delete, FALSE if add
+    @param fault_det - TRUE to enable fault detection/isolation
+*/
+define udp_ping_add_del_req {
+  u32 client_index;
+  u32 context;
+  u8 src_ip_address[16];
+  u8 dst_ip_address[16];
+  u16 start_src_port;
+  u16 end_src_port;
+  u16 start_dst_port;
+  u16 end_dst_port;
+  u16 interval;
+  u8 is_ipv4;
+  u8 dis;
+  u8 fault_det;
+  u8 reserve[3];
+};
+
+/** \brief Udp-probe add/del response
+    @param context - sender context, to match reply w/ request
+    @param retval - return value for request
+*/
+define udp_ping_add_del_reply {
+  u32 context;
+  i32 retval;
+};
+
+/** \brief Udp-probe export add/del request
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param enable - If TRUE then enable export else disable
+*/
+define udp_ping_export_req {
+  u32 client_index;
+  u32 context;
+  u32 enable;
+};
+
+/** \brief Udp-probe export add/del response
+    @param context - sender context, to match reply w/ request
+    @param retval - return value for request
+*/
+define udp_ping_export_reply {
+  u32 context;
+  i32 retval;
+};
+
diff --git a/src/plugins/ioam/udp-ping/udp_ping.h b/src/plugins/ioam/udp-ping/udp_ping.h
new file mode 100644
index 00000000..26c42019
--- /dev/null
+++ b/src/plugins/ioam/udp-ping/udp_ping.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PLUGINS_IOAM_PLUGIN_IOAM_UDP_PING_UDP_PING_H_
+#define PLUGINS_IOAM_PLUGIN_IOAM_UDP_PING_UDP_PING_H_
+
+#include <ioam/analyse/ioam_analyse.h>
+
+#define MAX_PING_RETRIES 5
+
+#define EVENT_SIG_RECHECK 2
+
+/** @brief udp-ping session data.
+    @note cache aligned.
+*/
+typedef struct
+{
+  /** UDP ping packet */
+  u8 *ping_rewrite;
+
+  /** Ping packet rewrite string len. */
+  u16 rewrite_len;
+
+  /** Number of times the ping response was dropped.
+   * If retry > MAX_PING_RETRIES then connectivity is declared down.
+   */
+  u16 retry;
+
+  u16 reserve[2];
+
+  /** Analysed data. */
+  ioam_analyser_data_t analyse_data;
+
+  /** Used by ioam e2e to identify the flow and add sequence numbers. */
+  u32 flow_ctx;
+
+  /** Number of packets sent for this flow. */
+  u32 pak_sent;
+} udp_ping_flow_data;
+
+/** @brief udp-ping flow data.
+    @note cache aligned.
+*/
+typedef struct
+{
+  /** Time at which the next udp-ping probe has to be sent out. */
+  f64 next_send_time;
+
+  /** Interval, in seconds, at which ping packets are sent. */
+  u16 interval;
+
+  u16 reserve[3];
+
+  /** Start port of the source port range. */
+  u16 start_src_port;
+
+  /** End port of the source port range. */
+  u16 end_src_port;
+
+  /** Start port of the destination port range. */
+  u16 start_dst_port;
+
+  /** End port of the destination port range. */
+  u16 end_dst_port;
+
+  /** Ping statistics. */
+  udp_ping_flow_data *stats;
+
+} udp_ping_flow;
+
+/** @brief udp-ping data.
+*/
+typedef struct
+{
+  /** Local source IPv4/6 address to be used. */
+  ip46_address_t src;
+
+  /** Remote destination IPv4/6 address to be used. */
+  ip46_address_t dst;
+
+  /** Per flow data. */
+  udp_ping_flow udp_data;
+
+  /** To enable fault detection/isolation in the network. */
+  u8 fault_det;
+} ip46_udp_ping_flow;
+
+/** @brief udp-ping main data structure.
+*/
+typedef struct
+{
+  /** Vector of udp-ping flows. */
+  ip46_udp_ping_flow *ip46_flow;
+
+  /** Time interval at which the process node has to wake up. */
+  u64 timer_interval;
+
+  /** Pointer to vlib main for convenience. */
+  vlib_main_t *vlib_main;
+
+  /** Pointer to vnet main for convenience. */
+  vnet_main_t *vnet_main;
+
+  /** API message ID base */
+  u16 msg_id_base;
+} udp_ping_main_t;
+
+extern udp_ping_main_t udp_ping_main;
+
+void
+ip46_udp_ping_set_flow (ip46_address_t src, ip46_address_t dst,
+                        u16 start_src_port, u16 end_src_port,
+                        u16 start_dst_port, u16 end_dst_port,
+                        u16 interval, u8 fault_det, u8 is_disable);
+
+clib_error_t *udp_ping_flow_create (u8 del);
+
+#endif /* PLUGINS_IOAM_PLUGIN_IOAM_UDP_PING_UDP_PING_H_ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/ioam/udp-ping/udp_ping_all_api_h.h b/src/plugins/ioam/udp-ping/udp_ping_all_api_h.h
new file mode 100644
index 00000000..1ed2e10b
--- /dev/null
+++ b/src/plugins/ioam/udp-ping/udp_ping_all_api_h.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* Include the generated file, see BUILT_SOURCES in Makefile.am */ +#include <ioam/udp-ping/udp_ping.api.h> + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/udp-ping/udp_ping_api.c b/src/plugins/ioam/udp-ping/udp_ping_api.c new file mode 100644 index 00000000..75938731 --- /dev/null +++ b/src/plugins/ioam/udp-ping/udp_ping_api.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + *------------------------------------------------------------------ + * udp_ping_api.c - UDP Ping related APIs to create + * and maintain ping flows + *------------------------------------------------------------------ + */ + +#include <vnet/vnet.h> +#include <vnet/ip/ip.h> +#include <ioam/udp-ping/udp_ping.h> + +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vlibsocket/api.h> + +/* define message IDs */ +#include <ioam/udp-ping/udp_ping_msg_enum.h> + +/* define message structures */ +#define vl_typedefs +#include <ioam/udp-ping/udp_ping_all_api_h.h> +#undef vl_typedefs + +/* define generated endian-swappers */ +#define vl_endianfun +#include <ioam/udp-ping/udp_ping_all_api_h.h> +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) 
vlib_cli_output (handle, __VA_ARGS__) +#define vl_printfun +#include <ioam/udp-ping/udp_ping_all_api_h.h> +#undef vl_printfun + +/* Get the API version number */ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <ioam/udp-ping/udp_ping_all_api_h.h> +#undef vl_api_version + +#define REPLY_MSG_ID_BASE sm->msg_id_base +#include <vlibapi/api_helper_macros.h> + +/* List of message types that this module understands */ +#define foreach_udp_ping_api_msg \ + _(UDP_PING_ADD_DEL_REQ, udp_ping_add_del_req) \ + _(UDP_PING_EXPORT_REQ, udp_ping_export_req) \ + +static void vl_api_udp_ping_add_del_req_t_handler + (vl_api_udp_ping_add_del_req_t * mp) +{ + ip46_address_t dst, src; + int rv = 0; + udp_ping_main_t *sm = &udp_ping_main; + vl_api_udp_ping_add_del_reply_t *rmp; + + if (mp->is_ipv4) + { + rv = -1; //Not supported + goto ERROROUT; + } + + clib_memcpy ((void *) &src.ip6, (void *) mp->src_ip_address, + sizeof (ip6_address_t)); + clib_memcpy ((void *) &dst.ip6, (void *) mp->dst_ip_address, + sizeof (ip6_address_t)); + + ip46_udp_ping_set_flow (src, dst, + ntohs (mp->start_src_port), + ntohs (mp->end_src_port), + ntohs (mp->start_dst_port), + ntohs (mp->end_dst_port), + ntohs (mp->interval), mp->fault_det, mp->dis); + rv = 0; //FIXME + +ERROROUT: + REPLY_MACRO (VL_API_UDP_PING_ADD_DEL_REPLY); +} + +static void vl_api_udp_ping_export_req_t_handler + (vl_api_udp_ping_export_req_t * mp) +{ + udp_ping_main_t *sm = &udp_ping_main; + int rv = 0; + vl_api_udp_ping_export_reply_t *rmp; + + (void) udp_ping_flow_create (!mp->enable); + rv = 0; //FIXME + + REPLY_MACRO (VL_API_UDP_PING_EXPORT_REPLY); +} + +/* Set up the API message handling tables */ +static clib_error_t * +udp_ping_api_hookup (vlib_main_t * vm) +{ + udp_ping_main_t *sm = &udp_ping_main; +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + sm->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_udp_ping_api_msg; +#undef _ + + return 0; +} + +#define vl_msg_name_crc_list +#include <ioam/udp-ping/udp_ping_all_api_h.h> +#undef vl_msg_name_crc_list + +static void +setup_message_id_table (udp_ping_main_t * sm, api_main_t * am) +{ +#define _(id,n,crc) \ + vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id + sm->msg_id_base); + foreach_vl_msg_name_crc_udp_ping; +#undef _ +} + +static clib_error_t * +udp_ping_api_init (vlib_main_t * vm) +{ + udp_ping_main_t *sm = &udp_ping_main; + clib_error_t *error = 0; + u8 *name; + + name = format (0, "udp_ping_%08x%c", api_version, 0); + + /* Ask for a correctly-sized block of API message decode slots */ + sm->msg_id_base = vl_msg_api_get_msg_ids + ((char *) name, VL_MSG_FIRST_AVAILABLE); + + error = udp_ping_api_hookup (vm); + + /* Add our API messages to the global name_crc hash table */ + setup_message_id_table (sm, &api_main); + + vec_free (name); + + return error; +} + +VLIB_INIT_FUNCTION (udp_ping_api_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/udp-ping/udp_ping_export.c b/src/plugins/ioam/udp-ping/udp_ping_export.c new file mode 100644 index 00000000..91cbb4bd --- /dev/null +++ b/src/plugins/ioam/udp-ping/udp_ping_export.c @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vnet/flow/flow_report.h> +#include <ioam/analyse/ioam_summary_export.h> +#include <vnet/api_errno.h> +#include <ioam/udp-ping/udp_ping.h> + +#define UDP_PING_EXPORT_RECORD_SIZE 400 + +static u8 * +udp_ping_template_rewrite (flow_report_main_t * frm, flow_report_t * fr, + ip4_address_t * collector_address, + ip4_address_t * src_address, u16 collector_port) +{ + return ioam_template_rewrite (frm, fr, collector_address, + src_address, collector_port); +} + +static vlib_frame_t * +udp_ping_send_flows (flow_report_main_t * frm, flow_report_t * fr, + vlib_frame_t * f, u32 * to_next, u32 node_index) +{ + vlib_buffer_t *b0 = NULL; + u32 next_offset = 0; + u32 bi0 = ~0; + int i, j; + ip4_ipfix_template_packet_t *tp; + ipfix_message_header_t *h; + ipfix_set_header_t *s = NULL; + ip4_header_t *ip; + udp_header_t *udp; + u32 records_this_buffer; + u16 new_l0, old_l0; + ip_csum_t sum0; + vlib_main_t *vm = frm->vlib_main; + flow_report_stream_t *stream; + udp_ping_flow_data *stats; + ip46_udp_ping_flow *ip46_flow; + u16 src_port, dst_port; + u16 data_len; + + stream = &frm->streams[fr->stream_index]; + data_len = vec_len (udp_ping_main.ip46_flow); + + for (i = 0; i < data_len; i++) + { + if (pool_is_free_index (udp_ping_main.ip46_flow, i)) + continue; + + ip46_flow = pool_elt_at_index (udp_ping_main.ip46_flow, i); + j = 0; + for (src_port = ip46_flow->udp_data.start_src_port; + src_port <= ip46_flow->udp_data.end_src_port; src_port++) + { + for (dst_port = ip46_flow->udp_data.start_dst_port; + dst_port <= ip46_flow->udp_data.end_dst_port; dst_port++, j++) + { + stats = ip46_flow->udp_data.stats + j; + if (PREDICT_FALSE (b0 == NULL)) + { + if (vlib_buffer_alloc (vm, &bi0, 1) != 1) + break; + + + b0 = vlib_get_buffer (vm, bi0); + memcpy (b0->data, fr->rewrite, vec_len (fr->rewrite)); + b0->current_data = 0; + b0->current_length = vec_len (fr->rewrite); + b0->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; + vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0; + + tp = vlib_buffer_get_current (b0); + ip = &tp->ip4; + h = &tp->ipfix.h; + s = &tp->ipfix.s; + + /* FIXUP: message header export_time */ + h->export_time = clib_host_to_net_u32 (((u32) time (NULL))); + + /* FIXUP: message header sequence_number */ + h->sequence_number = stream->sequence_number++; + h->sequence_number = + clib_host_to_net_u32 (h->sequence_number); + next_offset = (u32) (((u8 *) (s + 1)) - (u8 *) tp); + records_this_buffer = 0; + } + + next_offset = ioam_analyse_add_ipfix_record (fr, + &stats->analyse_data, + b0, next_offset, + &ip46_flow-> + src.ip6, + &ip46_flow-> + dst.ip6, src_port, + dst_port); + + //u32 pak_sent = clib_host_to_net_u32(stats->pak_sent); + //memcpy (b0->data + next_offset, &pak_sent, sizeof(u32)); + //next_offset += sizeof(u32); + + records_this_buffer++; + + /* Flush data if packet len is about to reach path mtu */ + if (next_offset > (frm->path_mtu - UDP_PING_EXPORT_RECORD_SIZE)) + { + b0->current_length = next_offset; + b0->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; + tp = vlib_buffer_get_current (b0); + ip = (ip4_header_t *) & tp->ip4; + udp = 
(udp_header_t *) (ip + 1); + h = &tp->ipfix.h; + s = &tp->ipfix.s; + + s->set_id_length = + ipfix_set_id_length (IOAM_FLOW_TEMPLATE_ID, + next_offset - (sizeof (*ip) + + sizeof (*udp) + + sizeof (*h))); + h->version_length = + version_length (next_offset - + (sizeof (*ip) + sizeof (*udp))); + + sum0 = ip->checksum; + old_l0 = ip->length; + new_l0 = clib_host_to_net_u16 ((u16) next_offset); + sum0 = ip_csum_update (sum0, old_l0, new_l0, ip4_header_t, + length /* changed member */ ); + + ip->checksum = ip_csum_fold (sum0); + ip->length = new_l0; + udp->length = + clib_host_to_net_u16 (b0->current_length - sizeof (*ip)); + + udp->checksum = ip4_tcp_udp_compute_checksum (vm, b0, ip); + if (udp->checksum == 0) + udp->checksum = 0xffff; + + ASSERT (ip->checksum == ip4_header_checksum (ip)); + + to_next[0] = bi0; + f->n_vectors++; + to_next++; + + if (f->n_vectors == VLIB_FRAME_SIZE) + { + vlib_put_frame_to_node (vm, node_index, f); + f = vlib_get_frame_to_node (vm, node_index); + f->n_vectors = 0; + to_next = vlib_frame_vector_args (f); + } + b0 = 0; + bi0 = ~0; + } + } + } + } + + if (b0) + { + b0->current_length = next_offset; + b0->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; + tp = vlib_buffer_get_current (b0); + ip = (ip4_header_t *) & tp->ip4; + udp = (udp_header_t *) (ip + 1); + h = &tp->ipfix.h; + s = &tp->ipfix.s; + + s->set_id_length = ipfix_set_id_length (IOAM_FLOW_TEMPLATE_ID, + next_offset - (sizeof (*ip) + + sizeof (*udp) + + sizeof (*h))); + h->version_length = + version_length (next_offset - (sizeof (*ip) + sizeof (*udp))); + + sum0 = ip->checksum; + old_l0 = ip->length; + new_l0 = clib_host_to_net_u16 ((u16) next_offset); + sum0 = ip_csum_update (sum0, old_l0, new_l0, ip4_header_t, + length /* changed member */ ); + + ip->checksum = ip_csum_fold (sum0); + ip->length = new_l0; + udp->length = clib_host_to_net_u16 (b0->current_length - sizeof (*ip)); + + udp->checksum = ip4_tcp_udp_compute_checksum (vm, b0, ip); + if (udp->checksum == 0) + udp->checksum = 0xffff; + + ASSERT (ip->checksum == ip4_header_checksum (ip)); + + to_next[0] = bi0; + f->n_vectors++; + to_next++; + + if (f->n_vectors == VLIB_FRAME_SIZE) + { + vlib_put_frame_to_node (vm, node_index, f); + f = vlib_get_frame_to_node (vm, node_index); + f->n_vectors = 0; + to_next = vlib_frame_vector_args (f); + } + b0 = 0; + bi0 = ~0; + } + return f; +} + +clib_error_t * +udp_ping_flow_create (u8 del) +{ + vnet_flow_report_add_del_args_t args; + int rv; + u32 domain_id = 0; + flow_report_main_t *frm = &flow_report_main; + u16 template_id; + + memset (&args, 0, sizeof (args)); + args.rewrite_callback = udp_ping_template_rewrite; + args.flow_data_callback = udp_ping_send_flows; + del ? 
(args.is_add = 0) : (args.is_add = 1); + args.domain_id = domain_id; + args.src_port = UDP_DST_PORT_ipfix; + + rv = vnet_flow_report_add_del (frm, &args, &template_id); + + switch (rv) + { + case 0: + break; + case VNET_API_ERROR_NO_SUCH_ENTRY: + return clib_error_return (0, "registration not found..."); + default: + return clib_error_return (0, "vnet_flow_report_add_del returned %d", + rv); + } + + return 0; +} + +static clib_error_t * +set_udp_ping_export_command_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + //int rv; + int is_add = 1; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "export-ipfix")) + is_add = 1; + else if (unformat (input, "disable")) + is_add = 0; + else + break; + } + + if (is_add) + (void) udp_ping_flow_create (0); + else + (void) udp_ping_flow_create (1); + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (set_udp_ping_export_command, static) = { + .path = "set udp-ping export-ipfix", + .short_help = "set udp-ping export-ipfix [disable]", + .function = set_udp_ping_export_command_fn, +}; +/* *INDENT-ON* */ + +clib_error_t * +udp_ping_flow_report_init (vlib_main_t * vm) +{ + clib_error_t *error; + + if ((error = vlib_call_init_function (vm, flow_report_init))) + return error; + + return 0; +} + +VLIB_INIT_FUNCTION (udp_ping_flow_report_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/udp-ping/udp_ping_msg_enum.h b/src/plugins/ioam/udp-ping/udp_ping_msg_enum.h new file mode 100644 index 00000000..dded1884 --- /dev/null +++ b/src/plugins/ioam/udp-ping/udp_ping_msg_enum.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef included_udp_ping_msg_enum_h +#define included_udp_ping_msg_enum_h + +#include <vppinfra/byte_order.h> + +#define vl_msg_id(n,h) n, +typedef enum +{ +#include <ioam/udp-ping/udp_ping_all_api_h.h> + /* We'll want to know how many messages IDs we need... */ + VL_MSG_FIRST_AVAILABLE, +} vl_msg_id_t; +#undef vl_msg_id + +#endif /* included_udp_ping_msg_enum_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/udp-ping/udp_ping_node.c b/src/plugins/ioam/udp-ping/udp_ping_node.c new file mode 100644 index 00000000..e1a57955 --- /dev/null +++ b/src/plugins/ioam/udp-ping/udp_ping_node.c @@ -0,0 +1,814 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vnet/vnet.h> +#include <vlib/vlib.h> +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vlibsocket/api.h> +#include <vnet/ip/ip.h> +#include <vnet/ip/ip6_hop_by_hop.h> +#include <ioam/encap/ip6_ioam_trace.h> +#include <ioam/encap/ip6_ioam_e2e.h> +#include <ioam/udp-ping/udp_ping_packet.h> +#include <ioam/udp-ping/udp_ping.h> +#include <ioam/udp-ping/udp_ping_util.h> +#include <vnet/srv6/sr_packet.h> + +typedef enum +{ + UDP_PING_NEXT_DROP, + UDP_PING_NEXT_PUNT, + UDP_PING_NEXT_UDP_LOOKUP, + UDP_PING_NEXT_ICMP, + UDP_PING_NEXT_IP6_LOOKUP, + UDP_PING_NEXT_IP6_DROP, + UDP_PING_N_NEXT, +} udp_ping_next_t; + +udp_ping_main_t udp_ping_main; + +uword +udp_ping_process (vlib_main_t * vm, + vlib_node_runtime_t * rt, vlib_frame_t * f); + +extern int +ip6_hbh_ioam_trace_data_list_handler (vlib_buffer_t * b, ip6_header_t * ip, + ip6_hop_by_hop_option_t * opt); + +typedef struct +{ + ip6_address_t src; + ip6_address_t dst; + u16 src_port; + u16 dst_port; + u16 handle; + u16 next_index; + u8 msg_type; +} udp_ping_trace_t; + +/* packet trace format function */ +static u8 * +format_udp_ping_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + udp_ping_trace_t *t = va_arg (*args, udp_ping_trace_t *); + + s = format (s, "udp-ping-local: src %U, dst %U, src_port %u, dst_port %u " + "handle %u, next_index %u, msg_type %u", + format_ip6_address, &t->src, + format_ip6_address, &t->dst, + t->src_port, t->dst_port, + t->handle, t->next_index, t->msg_type); + return s; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (udp_ping_node, static) = +{ + .function = udp_ping_process, + .type = VLIB_NODE_TYPE_PROCESS, + .name = "udp-ping-process", +}; +/* *INDENT-ON* */ + +void +udp_ping_calculate_timer_interval (void) +{ + int i; + ip46_udp_ping_flow *flow = NULL; + u16 min_interval = 0x1e9; + + for (i = 0; i < vec_len (udp_ping_main.ip46_flow); i++) + { + if (pool_is_free_index (udp_ping_main.ip46_flow, i)) + continue; + + flow = pool_elt_at_index (udp_ping_main.ip46_flow, i); + + if (min_interval > flow->udp_data.interval) + min_interval = flow->udp_data.interval; + } + + if (udp_ping_main.timer_interval != min_interval) + { + udp_ping_main.timer_interval = min_interval; + vlib_process_signal_event (udp_ping_main.vlib_main, + udp_ping_node.index, EVENT_SIG_RECHECK, 0); + } +} + +void +ip46_udp_ping_set_flow (ip46_address_t src, ip46_address_t dst, + u16 start_src_port, u16 end_src_port, + u16 start_dst_port, u16 end_dst_port, + u16 interval, u8 fault_det, u8 is_disable) +{ + u8 found = 0; + ip46_udp_ping_flow *flow = NULL; + int i; + + for (i = 0; i < vec_len (udp_ping_main.ip46_flow); i++) + { + if (pool_is_free_index (udp_ping_main.ip46_flow, i)) + continue; + + flow = pool_elt_at_index (udp_ping_main.ip46_flow, i); + if ((0 == udp_ping_compare_flow (src, dst, + start_src_port, end_src_port, + start_dst_port, end_dst_port, flow))) + { + found = 1; + break; + } + } + + if (found) + { + u16 cur_interval; + if (is_disable) + { + cur_interval = 
flow->udp_data.interval; + udp_ping_free_flow_data (flow); + pool_put_index (udp_ping_main.ip46_flow, i); + if (udp_ping_main.timer_interval == interval) + udp_ping_calculate_timer_interval (); + return; + } + + cur_interval = flow->udp_data.interval; + flow->udp_data.interval = interval; + if (udp_ping_main.timer_interval > interval) + { + udp_ping_main.timer_interval = interval; + vlib_process_signal_event (udp_ping_main.vlib_main, + udp_ping_node.index, + EVENT_SIG_RECHECK, 0); + } + else if (udp_ping_main.timer_interval == cur_interval) + udp_ping_calculate_timer_interval (); + + return; + } + + /* Delete operation and item not found */ + if (is_disable) + return; + + /* Alloc new session */ + pool_get_aligned (udp_ping_main.ip46_flow, flow, CLIB_CACHE_LINE_BYTES); + udp_ping_populate_flow (src, dst, + start_src_port, end_src_port, + start_dst_port, end_dst_port, + interval, fault_det, flow); + + udp_ping_create_rewrite (flow, (flow - udp_ping_main.ip46_flow)); + + if (udp_ping_main.timer_interval > interval) + { + udp_ping_main.timer_interval = interval; + vlib_process_signal_event (udp_ping_main.vlib_main, + udp_ping_node.index, EVENT_SIG_RECHECK, 0); + } + return; +} + +uword +unformat_port_range (unformat_input_t * input, va_list * args) +{ + u16 *start_port, *end_port; + uword c; + u8 colon_present = 0; + + start_port = va_arg (*args, u16 *); + end_port = va_arg (*args, u16 *); + + *start_port = *end_port = 0; + /* Get start port */ + while ((c = unformat_get_input (input)) != UNFORMAT_END_OF_INPUT) + { + switch (c) + { + case '0' ... '9': + *start_port = ((*start_port) * 10) + (c - '0'); + break; + + case ':': + colon_present = 1; + break; + + default: + return 0; + } + + if (colon_present) + break; + } + + if (!colon_present) + return 0; + + /* Get end port */ + while ((c = unformat_get_input (input)) != UNFORMAT_END_OF_INPUT) + { + switch (c) + { + case '0' ... 
'9': + *end_port = ((*end_port) * 10) + (c - '0'); + break; + + default: + return 1; + } + } + + if (end_port < start_port) + return 0; + + return 1; +} + +static clib_error_t * +set_udp_ping_command_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + ip46_address_t dst, src; + u16 start_src_port, end_src_port; + u16 start_dst_port, end_dst_port; + u32 interval; + u8 is_disable = 0; + u8 fault_det = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat + (input, "src %U", unformat_ip46_address, &src, IP46_TYPE_ANY)) + ; + else if (unformat (input, "src-port-range %U", + unformat_port_range, &start_src_port, &end_src_port)) + ; + else + if (unformat + (input, "dst %U", unformat_ip46_address, &dst, IP46_TYPE_ANY)) + ; + else if (unformat (input, "dst-port-range %U", + unformat_port_range, &start_dst_port, &end_dst_port)) + ; + else if (unformat (input, "interval %d", &interval)) + ; + else if (unformat (input, "fault-detect")) + fault_det = 1; + else if (unformat (input, "disable")) + is_disable = 1; + else + break; + } + + ip46_udp_ping_set_flow (src, dst, start_src_port, end_src_port, + start_dst_port, end_dst_port, (u16) interval, + fault_det, is_disable); + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (set_udp_ping_command, static) = +{ + .path = "set udp-ping", + .short_help = + "set udp-ping src <local IPv6 address> src-port-range <local port range> \ + dst <remote IPv6 address> dst-port-range <destination port range> \ + interval <time interval in sec for which ping packet will be sent> \ + [disable]", + .function = set_udp_ping_command_fn, +}; +/* *INDENT-ON* */ + +static clib_error_t * +show_udp_ping_summary_cmd_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + u8 *s = 0; + int i, j; + ip46_udp_ping_flow *ip46_flow; + u16 src_port, dst_port; + udp_ping_flow_data *stats; + + s = format (s, "UDP-Ping data:\n"); + + for (i = 0; i < vec_len (udp_ping_main.ip46_flow); i++) + { + if (pool_is_free_index (udp_ping_main.ip46_flow, i)) + continue; + + ip46_flow = pool_elt_at_index (udp_ping_main.ip46_flow, i); + s = format (s, "Src: %U, Dst: %U\n", + format_ip46_address, &ip46_flow->src, IP46_TYPE_ANY, + format_ip46_address, &ip46_flow->dst, IP46_TYPE_ANY); + + s = format (s, "Start src port: %u, End src port: %u\n", + ip46_flow->udp_data.start_src_port, + ip46_flow->udp_data.end_src_port); + s = format (s, "Start dst port: %u, End dst port: %u\n", + ip46_flow->udp_data.start_dst_port, + ip46_flow->udp_data.end_dst_port); + s = format (s, "Interval: %u\n", ip46_flow->udp_data.interval); + + j = 0; + for (src_port = ip46_flow->udp_data.start_src_port; + src_port <= ip46_flow->udp_data.end_src_port; src_port++) + { + for (dst_port = ip46_flow->udp_data.start_dst_port; + dst_port <= ip46_flow->udp_data.end_dst_port; dst_port++) + { + stats = ip46_flow->udp_data.stats + j; + s = + format (s, "\nSrc Port - %u, Dst Port - %u, Flow CTX - %u\n", + src_port, dst_port, stats->flow_ctx); + s = + format (s, "Path State - %s\n", + (stats->retry > MAX_PING_RETRIES) ? "Down" : "Up"); + s = format (s, "Path Data:\n"); + s = print_analyse_flow (s, + &ip46_flow->udp_data. 
+                                      stats[j].analyse_data);
+              j++;
+            }
+        }
+      s = format (s, "\n\n");
+    }
+
+  vlib_cli_output (vm, "%v", s);
+  vec_free (s);
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (show_udp_ping_cmd, static) =
+{
+  .path = "show udp-ping summary",
+  .short_help = "Summary of udp-ping",
+  .function = show_udp_ping_summary_cmd_fn,
+};
+/* *INDENT-ON* */
+
+/**
+ * @brief UDP-Ping Process node.
+ * @node udp-ping-process
+ *
+ * This is a process node that wakes up periodically to send
+ * out udp probe packets for all configured sessions.
+ *
+ * @param vm vlib_main_t corresponding to the current thread.
+ * @param rt vlib_node_runtime_t data for this node.
+ * @param f vlib_frame_t whose contents should be dispatched.
+ *
+ */
+uword
+udp_ping_process (vlib_main_t * vm,
+                  vlib_node_runtime_t * rt, vlib_frame_t * f)
+{
+  f64 now;
+  uword *event_data = 0;
+  int i;
+  ip46_udp_ping_flow *ip46_flow;
+
+  while (1)
+    {
+      vec_reset_length (event_data);
+      vlib_process_wait_for_event_or_clock (vm,
+                                            udp_ping_main.timer_interval);
+      (void) vlib_process_get_events (vm, &event_data);
+      now = vlib_time_now (vm);
+
+      for (i = 0; i < vec_len (udp_ping_main.ip46_flow); i++)
+        {
+          if (pool_is_free_index (udp_ping_main.ip46_flow, i))
+            continue;
+
+          ip46_flow = pool_elt_at_index (udp_ping_main.ip46_flow, i);
+          if (ip46_flow->udp_data.next_send_time < now)
+            udp_ping_send_ip6_pak (udp_ping_main.vlib_main, ip46_flow);
+        }
+    }
+  return 0;
+}
+
+/**
+ * @brief HopByHop analyse function for udp-ping response.
+ *
+ * Walks through all hbh options present in the udp-ping response
+ * and uses the analyser library for the analysis.
+ *
+ */
+void
+udp_ping_analyse_hbh (vlib_buffer_t * b0,
+                      u32 flow_id,
+                      u16 src_port,
+                      u16 dst_port,
+                      ip6_hop_by_hop_option_t * opt0,
+                      ip6_hop_by_hop_option_t * limit0, u16 len)
+{
+  u8 type0;
+  ip46_udp_ping_flow *ip46_flow;
+  u16 flow_index;
+  ioam_analyser_data_t *data;
+  ioam_e2e_option_t *e2e;
+  ioam_trace_option_t *trace;
+
+  /* If the packet doesn't match a UDP session then return */
+  if (PREDICT_FALSE (pool_is_free_index (udp_ping_main.ip46_flow, flow_id)))
+    return;
+
+  ip46_flow = udp_ping_main.ip46_flow + flow_id;
+  /* Check that both ports are within the configured ranges */
+  if (PREDICT_FALSE ((src_port < ip46_flow->udp_data.start_src_port) ||
+                     (src_port > ip46_flow->udp_data.end_src_port) ||
+                     (dst_port < ip46_flow->udp_data.start_dst_port) ||
+                     (dst_port > ip46_flow->udp_data.end_dst_port)))
+    return;
+
+  /* Stats are laid out as a 2-D array flattened over
+   * (src port offset, dst port offset) */
+  flow_index = (src_port - ip46_flow->udp_data.start_src_port) *
+    (ip46_flow->udp_data.end_dst_port - ip46_flow->udp_data.start_dst_port
+     + 1);
+  flow_index += (dst_port - ip46_flow->udp_data.start_dst_port);
+  data = &(ip46_flow->udp_data.stats[flow_index].analyse_data);
+
+  data->pkt_counter++;
+  data->bytes_counter += len;
+
+  vnet_buffer (b0)->l2_classify.opaque_index =
+    ip46_flow->udp_data.stats[flow_index].flow_ctx;
+
+  while (opt0 < limit0)
+    {
+      type0 = opt0->type;
+      switch (type0)
+        {
+        case HBH_OPTION_TYPE_IOAM_TRACE_DATA_LIST:
+          /* Add trace here as it hasn't been done yet */
+          vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0;
+          trace = (ioam_trace_option_t *) opt0;
+          if (PREDICT_FALSE
+              (trace->trace_hdr.ioam_trace_type & BIT_LOOPBACK_REPLY))
+            {
+              ip6_ioam_analyse_hbh_trace_loopback (data, &trace->trace_hdr,
+                                                   (trace->hdr.length - 2));
+              return;
+            }
+          ip6_hbh_ioam_trace_data_list_handler (b0,
+                                                vlib_buffer_get_current (b0),
+                                                opt0);
+          (void) ip6_ioam_analyse_hbh_trace (data, &trace->trace_hdr, len,
+                                             (trace->hdr.length - 2));
+          break;
+        case HBH_OPTION_TYPE_IOAM_EDGE_TO_EDGE:
+          e2e =
+            (ioam_e2e_option_t *) opt0;
+          (void) ip6_ioam_analyse_hbh_e2e (data, &e2e->e2e_hdr, len);
+          break;
+        case 0:                /* Pad1 */
+          /* Pad1 is a single byte: advance by exactly one byte
+           * (the increment must happen before the cast) */
+          opt0 = (ip6_hop_by_hop_option_t *) (((u8 *) opt0) + 1);
+          continue;
+        case 1:                /* PadN */
+          break;
+        default:
+          break;
+        }
+      opt0 = (ip6_hop_by_hop_option_t *) (((u8 *) opt0) + opt0->length +
+                                          sizeof (ip6_hop_by_hop_option_t));
+    }
+  ip46_flow->udp_data.stats[flow_index].retry = 0;
+}
+
+/**
+ * @brief UDP-Ping request/response handler function.
+ *
+ * Checks the udp-ping packet type - request or response - and handles it.
+ * If the packet is not a udp-ping packet, strips off the hbh options and
+ * enqueues the packet to the registered protocol node so that next
+ * protocol processing can continue.
+ *
+ */
+void
+udp_ping_local_analyse (vlib_buffer_t * b0,
+                        ip6_header_t * ip0,
+                        ip6_hop_by_hop_header_t * hbh0, u16 * next0)
+{
+  ip6_main_t *im = &ip6_main;
+  ip_lookup_main_t *lm = &im->lookup_main;
+
+  *next0 = UDP_PING_NEXT_IP6_DROP;
+
+  if (PREDICT_TRUE (hbh0->protocol == IP_PROTOCOL_UDP))
+    {
+      ip6_hop_by_hop_option_t *opt0;
+      ip6_hop_by_hop_option_t *limit0;
+      u16 p_len0;
+      udp_ping_t *udp0;
+
+      /* Check for udp ping packet */
+      udp0 = (udp_ping_t *) ((u8 *) hbh0 + ((hbh0->length + 1) << 3));
+      opt0 = (ip6_hop_by_hop_option_t *) (hbh0 + 1);
+      if ((udp0->ping_data.probe_marker1 ==
+           clib_host_to_net_u32 (UDP_PING_PROBE_MARKER1)) &&
+          (udp0->ping_data.probe_marker2 ==
+           clib_host_to_net_u32 (UDP_PING_PROBE_MARKER2)))
+        {
+          if (udp0->ping_data.msg_type == UDP_PING_PROBE)
+            {
+              udp_ping_create_reply_from_probe_ip6 (ip0, hbh0, udp0);
+              /* Skip e2e processing */
+              vnet_buffer (b0)->l2_classify.opaque_index = 0x7FFFFFFF;
+              *next0 = UDP_PING_NEXT_IP6_LOOKUP;
+              return;
+            }
+
+          /* Reply */
+          opt0 = (ip6_hop_by_hop_option_t *) (hbh0 + 1);
+          limit0 = (ip6_hop_by_hop_option_t *)
+            ((u8 *) hbh0 + ((hbh0->length + 1) << 3));
+          p_len0 = clib_net_to_host_u16 (ip0->payload_length);
+          udp_ping_analyse_hbh (b0,
+                                clib_net_to_host_u16
+                                (udp0->ping_data.sender_handle),
+                                clib_net_to_host_u16 (udp0->udp.dst_port),
+                                clib_net_to_host_u16 (udp0->udp.src_port),
+                                opt0, limit0, p_len0);
+
+          /* UDP Ping packet, so return */
+          return;
+        }
+    }
+
+  /* If the next header is SR, then the destination may get overwritten
+   * with a remote address. So pass it to SR processing, as it may be a
+   * local packet after all.
+   */
+  if (PREDICT_FALSE (hbh0->protocol == IPPROTO_IPV6_ROUTE))
+    goto end;
+
+  /* Otherwise remove the hbh-ioam headers */
+  u64 *copy_dst0, *copy_src0;
+  u16 new_l0;
+
+  vlib_buffer_advance (b0, (hbh0->length + 1) << 3);
+
+  new_l0 = clib_net_to_host_u16 (ip0->payload_length) -
+    ((hbh0->length + 1) << 3);
+
+  ip0->payload_length = clib_host_to_net_u16 (new_l0);
+
+  ip0->protocol = hbh0->protocol;
+
+  copy_src0 = (u64 *) ip0;
+  copy_dst0 = copy_src0 + (hbh0->length + 1);
+  copy_dst0[4] = copy_src0[4];
+  copy_dst0[3] = copy_src0[3];
+  copy_dst0[2] = copy_src0[2];
+  copy_dst0[1] = copy_src0[1];
+  copy_dst0[0] = copy_src0[0];
+
+end:
+  *next0 = lm->local_next_by_ip_protocol[hbh0->protocol];
+  return;
+}
+
+/**
+ * @brief udp ping request/response packet receive node.
+ * @node udp-ping-local
+ *
+ * This function receives udp ping request/response packets and processes
+ * them. For request packets, a response is created and sent.
+ * For response packets, they are analysed and the results stored.
+ *
+ * @param vm vlib_main_t corresponding to the current thread.
+ * @param node vlib_node_runtime_t data for this node.
+ * @param frame vlib_frame_t whose contents should be dispatched.
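+ * @return frame->n_vectors, the number of packets processed.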
+ * + * @par Graph mechanics: buffer, next index usage + * + * <em>Uses:</em> + * - <code>udp_ping_local_analyse(p0, ip0, hbh0, &next0)</code> + * - Checks packet type - request/respnse and process them. + * + * <em>Next Index:</em> + * - Dispatches the packet to ip6-lookup/ip6-drop depending on type of packet. + */ +static uword +udp_ping_local_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + udp_ping_next_t next_index; + u32 *from, *to_next, n_left_from, n_left_to_next; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next >= 2) + { + vlib_buffer_t *p0, *p1; + ip6_header_t *ip0, *ip1; + ip6_hop_by_hop_header_t *hbh0, *hbh1; + u16 next0, next1; + u32 pi0, pi1; + + /* Prefetch next iteration. */ + { + vlib_buffer_t *p2, *p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + /* Prefetch 3 cache lines as we need to look deep into packet */ + CLIB_PREFETCH (p2->data, 3 * CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (p3->data, 3 * CLIB_CACHE_LINE_BYTES, STORE); + } + + pi0 = to_next[0] = from[0]; + pi1 = to_next[1] = from[1]; + from += 2; + n_left_from -= 2; + to_next += 2; + n_left_to_next -= 2; + + p0 = vlib_get_buffer (vm, pi0); + p1 = vlib_get_buffer (vm, pi1); + + ip0 = vlib_buffer_get_current (p0); + ip1 = vlib_buffer_get_current (p1); + + hbh0 = (ip6_hop_by_hop_header_t *) (ip0 + 1); + hbh1 = (ip6_hop_by_hop_header_t *) (ip1 + 1); + + udp_ping_local_analyse (p0, ip0, hbh0, &next0); + udp_ping_local_analyse (p1, ip1, hbh1, &next1); + + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE))) + { + if (p0->flags & VLIB_BUFFER_IS_TRACED) + { + udp_ping_trace_t *t0 = + vlib_add_trace (vm, node, p0, sizeof (*t0)); + udp_ping_t *udp0; + + /* Check for udp ping packet */ + udp0 = + (udp_ping_t *) ((u8 *) hbh0 + ((hbh0->length + 1) << 3)); + t0->src = ip0->src_address; + t0->dst = ip0->dst_address; + t0->src_port = clib_net_to_host_u16 (udp0->udp.src_port); + t0->dst_port = clib_net_to_host_u16 (udp0->udp.dst_port); + t0->handle = + clib_net_to_host_u16 (udp0->ping_data.sender_handle); + t0->msg_type = udp0->ping_data.msg_type; + t0->next_index = next0; + } + if (p1->flags & VLIB_BUFFER_IS_TRACED) + { + udp_ping_trace_t *t1 = + vlib_add_trace (vm, node, p1, sizeof (*t1)); + udp_ping_t *udp1; + + /* Check for udp ping packet */ + udp1 = + (udp_ping_t *) ((u8 *) hbh1 + ((hbh1->length + 1) << 3)); + t1->src = ip1->src_address; + t1->dst = ip1->dst_address; + t1->src_port = clib_net_to_host_u16 (udp1->udp.src_port); + t1->dst_port = clib_net_to_host_u16 (udp1->udp.dst_port); + t1->handle = + clib_net_to_host_u16 (udp1->ping_data.sender_handle); + t1->msg_type = udp1->ping_data.msg_type; + t1->next_index = next1; + } + } + + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + pi0, pi1, next0, next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + vlib_buffer_t *p0; + ip6_header_t *ip0; + ip6_hop_by_hop_header_t *hbh0; + u16 next0; + u32 pi0; + + pi0 = from[0]; + to_next[0] = pi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, pi0); + ip0 = vlib_buffer_get_current (p0); + hbh0 = (ip6_hop_by_hop_header_t *) (ip0 + 1); + + 
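+          /* This node is registered for IP_PROTOCOL_IP6_HOP_BY_HOP_OPTIONS
+           * (see udp_ping_init below), so the hop-by-hop header always
+           * immediately follows the fixed IPv6 header here. */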
udp_ping_local_analyse (p0, ip0, hbh0, &next0); + + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE))) + { + if (p0->flags & VLIB_BUFFER_IS_TRACED) + { + udp_ping_trace_t *t0 = + vlib_add_trace (vm, node, p0, sizeof (*t0)); + udp_ping_t *udp0; + + /* Check for udp ping packet */ + udp0 = + (udp_ping_t *) ((u8 *) hbh0 + ((hbh0->length + 1) << 3)); + t0->src = ip0->src_address; + t0->dst = ip0->dst_address; + t0->src_port = clib_net_to_host_u16 (udp0->udp.src_port); + t0->dst_port = clib_net_to_host_u16 (udp0->udp.dst_port); + t0->handle = + clib_net_to_host_u16 (udp0->ping_data.sender_handle); + t0->msg_type = udp0->ping_data.msg_type; + t0->next_index = next0; + } + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + pi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return frame->n_vectors; +} + +/* *INDENT-OFF* */ +/* + * Node for udp-ping-local + */ +VLIB_REGISTER_NODE (udp_ping_local, static) = +{ + .function = udp_ping_local_node_fn, + .name = "udp-ping-local", + .vector_size = sizeof (u32), + .format_trace = format_udp_ping_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_next_nodes = UDP_PING_N_NEXT, + .next_nodes = + { + [UDP_PING_NEXT_DROP] = "error-drop", + [UDP_PING_NEXT_PUNT] = "error-punt", + [UDP_PING_NEXT_UDP_LOOKUP] = "ip6-udp-lookup", + [UDP_PING_NEXT_ICMP] = "ip6-icmp-input", + [UDP_PING_NEXT_IP6_LOOKUP] = "ip6-lookup", + [UDP_PING_NEXT_IP6_DROP] = "ip6-drop", + }, +}; +/* *INDENT-ON* */ + +static clib_error_t * +udp_ping_init (vlib_main_t * vm) +{ + clib_error_t *error = 0; + + udp_ping_main.vlib_main = vm; + udp_ping_main.vnet_main = vnet_get_main (); + udp_ping_main.timer_interval = 1e9; + + if ((error = vlib_call_init_function (vm, ip_main_init))) + return (error); + + ip6_register_protocol (IP_PROTOCOL_IP6_HOP_BY_HOP_OPTIONS, + udp_ping_local.index); + return 0; +} + +VLIB_INIT_FUNCTION (udp_ping_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/udp-ping/udp_ping_packet.h b/src/plugins/ioam/udp-ping/udp_ping_packet.h new file mode 100644 index 00000000..09dcb1c2 --- /dev/null +++ b/src/plugins/ioam/udp-ping/udp_ping_packet.h @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef PLUGINS_IOAM_PLUGIN_IOAM_UDP_PING_UDP_PING_PACKET_H_ +#define PLUGINS_IOAM_PLUGIN_IOAM_UDP_PING_UDP_PING_PACKET_H_ + +#include <vppinfra/clib.h> +#include <vnet/ip/ip6_hop_by_hop_packet.h> +#include <vnet/udp/udp_packet.h> + +#define UDP_PING_PROBE 1 +#define UDP_PING_REPLY 2 + +#define UDP_PING_PROBE_MARKER1 0xDEAD +#define UDP_PING_PROBE_MARKER2 0xBEEF + +/* + * Refer to: + * https://tools.ietf.org/html/draft-lapukhov-dataplane-probe-01 + * 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Probe Marker (1) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Probe Marker (2) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Version | Message Type | Flags | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Telemetry Request Vector | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Hop Limit | Hop Count | Must Be Zero | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Maximum Length | Current Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Sender's Handle | Sequence Number | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * (1) The "Probe Marker" fields are arbitrary 32-bit values generally + used by the network elements to identify the packet as a probe + packet. These fields should be interpreted as unsigned integer + values, stored in network byte order. For example, a network + element may be configured to recognize a UDP packet destined to + port 31337 and having 0xDEAD 0xBEEF as the values in "Probe + Marker" field as an active probe, and treat it respectively. + + (2) "Version Number" is currently set to 1. + + (3) The "Message Type" field value could be either "1" - "Probe" or + "2" - "Probe Reply" + + (4) The "Flags" field is 8 bits, and defines the following flags: + + (5) + (1) "Overflow" (O-bit) (least significant bit). This bit is + set by the network element if the number of records on the + packet is at the maximum limit as specified by the packet: + i.e. the packet is already "full" of telemetry + information. + + (6) "Telemetry Request Vector" is a 32-bit long field that requests + well-known inband telemetry information from the network + elements on the path. A bit set in this vector translates to a + request of a particular type of information. The following + types/bits are currently defined, starting with the least + significant bit first: + + (1) Bit 0: Device identifier. + + (2) Bit 1: Timestamp. + + (3) Bit 2: Queueing delay. + + (4) Bit 3: Ingress/Egress port identifiers. + + (5) Bit 31: Opaque state snapshot request. + + (7) "Hop Limit" is defined only for "Message Type" of "1" + ("Probe"). For "Probe Reply" the "Hop Limit" field must be set + to zero. This field is treated as an integer value + representing the number of network elements. See the Section 4 + section on the intended use of the field. + + (8) The "Hop Count" field specifies the current number of hops of + capable network elements the packet has transit through. It + begins with zero and must be incremented by one for every + network element that adds a telemetry record. Combined with a + push mechanism, this simplifies the work for the subsequent + network element and the packet receiver. The subsequent + network element just needs to parse the template and then + insert new record(s) immediately after the template. 
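+
+        As an illustration (not part of the draft text), a sender that
+        wants only a device identifier and a timestamp from each hop
+        would fill the fixed probe header roughly as follows, using the
+        struct defined below:
+
+          udp_ping_data d = { 0 };
+          d.probe_marker1 = clib_host_to_net_u32 (UDP_PING_PROBE_MARKER1);
+          d.probe_marker2 = clib_host_to_net_u32 (UDP_PING_PROBE_MARKER2);
+          d.version = 1;
+          d.msg_type = UDP_PING_PROBE;
+          d.tel_req_vec = clib_host_to_net_u32 (0x3);   (bits 0 and 1)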
+
+   (9)  The "Max Length" field specifies the maximum length of the
+        telemetry payload in bytes.  Given that the sender knows the
+        minimum path MTU, the sender can set the maximum number of
+        payload bytes allowed before exceeding the MTU.  Thus, a simple
+        comparison between "Current Length" and "Max Length" makes it
+        possible to decide whether or not data could be added.
+
+   (10) The "Current Length" field specifies the current length of data
+        stored in the probe.  This field is incremented by each network
+        element by the number of bytes it has added with the telemetry
+        data frame.
+
+   (11) The "Sender's Handle" field is set by the sender to allow the
+        receiver to identify a particular originator of probe packets.
+        Along with "Sequence Number" it allows for tracking of packet
+        order and loss within the network.
+
+ *
+ */
+typedef struct
+{
+  u32 probe_marker1;
+  u32 probe_marker2;
+  u8 version;
+  u8 msg_type;
+  u16 flags;
+  u32 tel_req_vec;
+  u8 hop_limit;
+  u8 hop_count;
+  u16 reserve;
+  u16 max_len;
+  u16 cur_len;
+  u16 sender_handle;
+  u16 seq_no;
+} udp_ping_data;
+
+typedef struct
+{
+  udp_header_t udp;
+  udp_ping_data ping_data;
+} udp_ping_t;
+
+#endif /* PLUGINS_IOAM_PLUGIN_IOAM_UDP_PING_UDP_PING_PACKET_H_ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/ioam/udp-ping/udp_ping_test.c b/src/plugins/ioam/udp-ping/udp_ping_test.c
new file mode 100644
index 00000000..4ec11351
--- /dev/null
+++ b/src/plugins/ioam/udp-ping/udp_ping_test.c
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ *------------------------------------------------------------------
+ * udp_ping_test.c - test harness for the udp ping plugin
+ *------------------------------------------------------------------
+ */
+
+#include <vat/vat.h>
+#include <vlibapi/api.h>
+#include <vlibmemory/api.h>
+#include <vlibsocket/api.h>
+#include <vppinfra/error.h>
+#include <vnet/ip/ip.h>
+
+/* Declare message IDs */
+#include <ioam/udp-ping/udp_ping_msg_enum.h>
+
+/* define message structures */
+#define vl_typedefs
+#include <ioam/udp-ping/udp_ping_all_api_h.h>
+#undef vl_typedefs
+
+/* declare message handlers for each api */
+
+#define vl_endianfun            /* define endian-swap functions */
+#include <ioam/udp-ping/udp_ping_all_api_h.h>
+#undef vl_endianfun
+
+/* instantiate all the print functions we know about */
+#define vl_print(handle, ...)
+#define vl_printfun
+#include <ioam/udp-ping/udp_ping_all_api_h.h>
+#undef vl_printfun
+
+/* Get the API version number.
*/ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <ioam/udp-ping/udp_ping_all_api_h.h> +#undef vl_api_version + + +typedef struct +{ + /* API message ID base */ + u16 msg_id_base; + vat_main_t *vat_main; +} udp_ping_test_main_t; + +udp_ping_test_main_t udp_ping_test_main; + +#define foreach_standard_reply_retval_handler \ +_(udp_ping_add_del_reply) \ +_(udp_ping_export_reply) + +#define _(n) \ + static void vl_api_##n##_t_handler \ + (vl_api_##n##_t * mp) \ + { \ + vat_main_t * vam = udp_ping_test_main.vat_main; \ + i32 retval = ntohl(mp->retval); \ + if (vam->async_mode) { \ + vam->async_errors += (retval < 0); \ + } else { \ + vam->retval = retval; \ + vam->result_ready = 1; \ + } \ + } +foreach_standard_reply_retval_handler; +#undef _ + +/* + * Table of message reply handlers, must include boilerplate handlers + * we just generated + */ +#define foreach_vpe_api_reply_msg \ +_(UDP_PING_ADD_DEL_REPLY, udp_ping_add_del_reply) \ +_(UDP_PING_EXPORT_REPLY, udp_ping_export_reply) \ + + +/* M: construct, but don't yet send a message */ + +#define M(T,t) \ +do { \ + vam->result_ready = 0; \ + mp = vl_msg_api_alloc(sizeof(*mp)); \ + memset (mp, 0, sizeof (*mp)); \ + mp->_vl_msg_id = ntohs (VL_API_##T + sm->msg_id_base); \ + mp->client_index = vam->my_client_index; \ +} while(0); + +/* S: send a message */ +#define S (vl_msg_api_send_shmem (vam->vl_input_queue, (u8 *)&mp)) + +/* W: wait for results, with timeout */ +#define W \ +do { \ + timeout = vat_time_now (vam) + 5.0; \ + \ + while (vat_time_now (vam) < timeout) { \ + if (vam->result_ready == 1) { \ + return (vam->retval); \ + } \ + } \ + return -99; \ +} while(0); + +static int +api_udp_ping_add_del_req (vat_main_t * vam) +{ + udp_ping_test_main_t *sm = &udp_ping_test_main; + unformat_input_t *input = vam->input; + vl_api_udp_ping_add_del_req_t *mp; + int rv = 0; + ip6_address_t dst, src; + u32 start_src_port, end_src_port; + u32 start_dst_port, end_dst_port; + u32 interval; + u8 is_disable = 0; + f64 timeout; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "src %U", unformat_ip6_address, &src)) + ; + else if (unformat (input, "start-src-port %d", &start_src_port)) + ; + else if (unformat (input, "end-src-port %d", &end_src_port)) + ; + else if (unformat (input, "start-dst-port %d", &start_dst_port)) + ; + else if (unformat (input, "end-dst-port %d", &end_dst_port)) + ; + else if (unformat (input, "dst %U", unformat_ip6_address, &dst)) + ; + else if (unformat (input, "interval %d", &interval)) + ; + else if (unformat (input, "disable")) + is_disable = 1; + else + break; + } + + M (UDP_PING_ADD_DEL_REQ, udp_ping_add); + + clib_memcpy (mp->src_ip_address, &src, 16); + clib_memcpy (mp->dst_ip_address, &dst, 16); + mp->start_src_port = (u16) start_src_port; + mp->end_src_port = (u16) end_src_port; + mp->start_dst_port = (u16) start_dst_port; + mp->end_dst_port = (u16) end_dst_port; + mp->interval = (u16) interval; + mp->is_ipv4 = 0; + mp->dis = is_disable; + + S; + W; + + return (rv); +} + +static int +api_udp_ping_export_req (vat_main_t * vam) +{ + udp_ping_test_main_t *sm = &udp_ping_test_main; + unformat_input_t *input = vam->input; + vl_api_udp_ping_export_req_t *mp; + int rv = 0; + int is_add = 1; + f64 timeout; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "export")) + is_add = 1; + else if (unformat (input, "disable")) + is_add = 0; + else + break; + } + + M (UDP_PING_EXPORT_REQ, udp_ping_export); + + mp->enable = is_add; + + S; 
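+  /* S mails the request to vpp via shared memory; W then polls until the
+   * reply handler above sets vam->result_ready, or gives up after the
+   * 5 second timeout built into the macro (returning -99). */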
+ W; + + return (rv); +} + +/* + * List of messages that the api test plugin sends, + * and that the data plane plugin processes + */ +#define foreach_vpe_api_msg \ +_(udp_ping_add_del_req, "src <local IPv6 address> start-src-port <first local port> "\ + "end-src-port <last local port> " \ + "dst <remote IPv6 address> start-dst-port <first destination port> "\ + "end-dst-port <last destination port> "\ + "interval <time interval in sec for which ping packet will be sent> "\ + "[disable]") \ +_(udp_ping_export_req, "export [disable]") \ + + +static void +udp_ping_test_api_hookup (vat_main_t * vam) +{ + udp_ping_test_main_t *sm = &udp_ping_test_main; + /* Hook up handlers for replies from the data plane plug-in */ +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + sm->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_vpe_api_reply_msg; +#undef _ + + /* API messages we can send */ +#define _(n,h) hash_set_mem (vam->function_by_name, #n, api_##n); + foreach_vpe_api_msg; +#undef _ + + /* Help strings */ +#define _(n,h) hash_set_mem (vam->help_by_name, #n, h); + foreach_vpe_api_msg; +#undef _ +} + +clib_error_t * +vat_plugin_register (vat_main_t * vam) +{ + udp_ping_test_main_t *sm = &udp_ping_test_main; + u8 *name; + + sm->vat_main = vam; + + name = format (0, "udp_ping_%08x%c", api_version, 0); + sm->msg_id_base = vl_client_get_first_plugin_msg_id ((char *) name); + + if (sm->msg_id_base != (u16) ~ 0) + udp_ping_test_api_hookup (vam); + + vec_free (name); + + return 0; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ioam/udp-ping/udp_ping_util.c b/src/plugins/ioam/udp-ping/udp_ping_util.c new file mode 100644 index 00000000..55f48ea4 --- /dev/null +++ b/src/plugins/ioam/udp-ping/udp_ping_util.c @@ -0,0 +1,330 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <vnet/vnet.h> +#include <vnet/ip/ip.h> +#include <vnet/ip/ip6_hop_by_hop.h> +#include <ioam/encap/ip6_ioam_e2e.h> +#include <ioam/encap/ip6_ioam_trace.h> +#include <ioam/udp-ping/udp_ping_packet.h> +#include <ioam/udp-ping/udp_ping.h> + +#define UDP_PING_REWRITE_LEN 1000 + +u16 +udp_ping_fill_udp_data (udp_ping_t * udp_ping, + u16 src_port, u16 dst_port, u8 msg_type, u16 ctx) +{ + /* Populate udp ping header */ + udp_ping->udp.src_port = clib_host_to_net_u16 (src_port); + udp_ping->udp.dst_port = clib_host_to_net_u16 (dst_port); + udp_ping->udp.length = clib_host_to_net_u16 (sizeof (udp_ping_t)); + udp_ping->udp.checksum = 0; + udp_ping->ping_data.probe_marker1 = + clib_host_to_net_u32 (UDP_PING_PROBE_MARKER1); + udp_ping->ping_data.probe_marker2 = + clib_host_to_net_u32 (UDP_PING_PROBE_MARKER2); + udp_ping->ping_data.version = 1; + udp_ping->ping_data.msg_type = msg_type; + udp_ping->ping_data.flags = clib_host_to_net_u16 (0); + udp_ping->ping_data.tel_req_vec = clib_host_to_net_u16 (0); + udp_ping->ping_data.hop_limit = 254; + udp_ping->ping_data.hop_count = 0; + udp_ping->ping_data.reserve = clib_host_to_net_u16 (0); + udp_ping->ping_data.max_len = + udp_ping->ping_data.cur_len = clib_host_to_net_u16 (0); + udp_ping->ping_data.sender_handle = clib_host_to_net_u16 (ctx); + udp_ping->ping_data.seq_no = clib_host_to_net_u16 (0); + + return (sizeof (udp_ping_t)); +} + +/** + * @brief Frame IPv6 udp-ping probe packet. + * + * Creates IPv6 UDP-Ping probe packet along with iOAM headers. + * + */ +int +udp_ping_create_ip6_pak (u8 * buf, /*u16 len, */ + ip6_address_t src, ip6_address_t dst, + u16 src_port, u16 dst_port, u8 msg_type, u16 ctx) +{ + ip6_header_t *ip0; + ip6_hop_by_hop_header_t *hbh0; + //trace_profile *profile = NULL; + u16 hbh_len = 0, rnd_size = 0, ip0_len = 0, udp_len = 0; + u16 trace_len = 0, trace_data_size = 0; + u16 e2e_len = sizeof (ioam_e2e_option_t) - sizeof (ip6_hop_by_hop_option_t); + u8 *current = NULL; + ioam_trace_option_t *trace_option; + ioam_e2e_option_t *e2e; + + ip0 = (ip6_header_t *) buf; + + ip0->ip_version_traffic_class_and_flow_label = + clib_host_to_net_u32 (0x6 << 28); + + ip0->protocol = IP_PROTOCOL_IP6_HOP_BY_HOP_OPTIONS; + ip0->hop_limit = 255; + + ip0->src_address = src; + ip0->dst_address = dst; + + hbh0 = (ip6_hop_by_hop_header_t *) (ip0 + 1); + + /* Calculate hbh header len */ + //profile = trace_profile_find(); + trace_data_size = fetch_trace_data_size (TRACE_TYPE_IF_TS_APP); + /* We need 2 times data for trace as packet traverse back to source */ + trace_len = sizeof (ioam_trace_option_t) + + (5 * trace_data_size * 2) - sizeof (ip6_hop_by_hop_option_t); + //(profile->num_elts * trace_data_size * 2); + hbh_len = e2e_len + trace_len + sizeof (ip6_hop_by_hop_header_t); + rnd_size = (hbh_len + 7) & ~7; + + /* Length of header in 8 octet units, not incl first 8 octets */ + hbh0->length = (rnd_size >> 3) - 1; + hbh0->protocol = IP_PROTOCOL_UDP; + + /* Populate hbh header */ + current = (u8 *) (hbh0 + 1); + + /* Populate trace */ + trace_option = (ioam_trace_option_t *) current; + trace_option->hdr.type = HBH_OPTION_TYPE_IOAM_TRACE_DATA_LIST | + HBH_OPTION_TYPE_DATA_CHANGE_ENROUTE; + trace_option->hdr.length = trace_len; + trace_option->trace_hdr.ioam_trace_type = + TRACE_TYPE_IF_TS_APP & TRACE_TYPE_MASK; + + trace_option->trace_hdr.data_list_elts_left = 5 * 2; + //profile->num_elts * 2; + + current += trace_option->hdr.length + sizeof (ip6_hop_by_hop_option_t); + + /* Populate e2e */ + e2e = (ioam_e2e_option_t *) current; + 
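+  /* Worked example of the hbh length math above: if e2e_len + trace_len
+   * + sizeof (ip6_hop_by_hop_header_t) were 230 bytes, rnd_size would be
+   * rounded up to 232 and hbh0->length would be (232 >> 3) - 1 = 28,
+   * i.e. the extension header length in 8-octet units excluding the
+   * first 8 octets, per RFC 2460. */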
e2e->hdr.type = HBH_OPTION_TYPE_IOAM_EDGE_TO_EDGE; + e2e->hdr.length = e2e_len; + + /* Move past hbh header */ + current = ((u8 *) hbh0) + ((hbh0->length + 1) << 3); + + /* Populate udp ping header */ + udp_len = udp_ping_fill_udp_data ((udp_ping_t *) current, + src_port, dst_port, msg_type, ctx); + + /* Calculate total length and set it in ip6 header */ + ip0_len = ((hbh0->length + 1) << 3) + udp_len; + //ip0_len = (len > ip0_len) ? len : ip0_len; + ip0->payload_length = clib_host_to_net_u16 (ip0_len); + + return (ip0_len + sizeof (ip6_header_t)); +} + +int +udp_ping_compare_flow (ip46_address_t src, ip46_address_t dst, + u16 start_src_port, u16 end_src_port, + u16 start_dst_port, u16 end_dst_port, + ip46_udp_ping_flow * flow) +{ + if ((0 == ip46_address_cmp (&flow->src, &src)) && + (0 == ip46_address_cmp (&flow->dst, &dst)) && + (flow->udp_data.start_src_port == start_src_port) && + (flow->udp_data.end_src_port == end_src_port) && + (flow->udp_data.start_dst_port == start_dst_port) && + (flow->udp_data.end_dst_port == end_dst_port)) + { + return 0; + } + + return -1; +} + +void +udp_ping_populate_flow (ip46_address_t src, ip46_address_t dst, + u16 start_src_port, u16 end_src_port, + u16 start_dst_port, u16 end_dst_port, + u16 interval, u8 fault_det, ip46_udp_ping_flow * flow) +{ + flow->src = src; + flow->dst = dst; + flow->udp_data.start_src_port = start_src_port; + flow->udp_data.end_src_port = end_src_port; + flow->udp_data.start_dst_port = start_dst_port; + flow->udp_data.end_dst_port = end_dst_port; + flow->udp_data.interval = interval; + flow->udp_data.next_send_time = 0; + flow->fault_det = fault_det; +} + +void +udp_ping_create_rewrite (ip46_udp_ping_flow * flow, u16 ctx) +{ + u16 src_port; + u16 dst_port; + u16 no_flows; + int i; + udp_ping_flow_data *stats; + + no_flows = + (flow->udp_data.end_dst_port - flow->udp_data.start_dst_port) + 1; + no_flows *= + ((flow->udp_data.end_src_port - flow->udp_data.start_src_port) + 1); + + vec_validate_aligned (flow->udp_data.stats, + no_flows - 1, CLIB_CACHE_LINE_BYTES); + + i = 0; + for (src_port = flow->udp_data.start_src_port; + src_port <= flow->udp_data.end_src_port; src_port++) + { + for (dst_port = flow->udp_data.start_dst_port; + dst_port <= flow->udp_data.end_dst_port; dst_port++) + { + u8 *rewrite = NULL; + + stats = flow->udp_data.stats + i; + ioam_analyse_init_data (&stats->analyse_data); + stats->analyse_data.is_free = 0; + + vec_validate (rewrite, UDP_PING_REWRITE_LEN - 1); + stats->ping_rewrite = rewrite; + stats->rewrite_len = + udp_ping_create_ip6_pak (rewrite, + flow->src.ip6, flow->dst.ip6, + src_port, dst_port, UDP_PING_PROBE, ctx); + /* For each flow we need to create ioam e2e flow */ + stats->flow_ctx = ioam_flow_add (1, (u8 *) "udp_ping"); //FIXME + i++; + } + } +} + +void +udp_ping_free_flow_data (ip46_udp_ping_flow * flow) +{ + int i; + udp_ping_flow_data *stats; + + for (i = 0; i < vec_len (flow->udp_data.stats); i++) + { + stats = flow->udp_data.stats + i; + vec_free (stats->ping_rewrite); + stats->ping_rewrite = NULL; + stats->rewrite_len = 0; + } + + vec_free (flow->udp_data.stats); + flow->udp_data.stats = NULL; +} + +/** + * @brief Create and send ipv6 udp-ping probe packet. 
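+ *
+ * One probe is sent per (src-port, dst-port) pair: the rewrite pre-built
+ * by udp_ping_create_rewrite is copied into a freshly allocated buffer,
+ * the UDP checksum is filled in, and the buffers are handed to the
+ * ip6-lookup node in frames of at most VLIB_FRAME_SIZE packets.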
+ *
+ */
+void
+udp_ping_send_ip6_pak (vlib_main_t * vm, ip46_udp_ping_flow * flow)
+{
+  u16 no_pak;
+  u32 *buffers = NULL;
+  u32 n_alloc;
+  int i;
+  vlib_buffer_t *b0;
+  udp_ping_flow_data *stats;
+  vlib_frame_t *nf = 0;
+  u32 *to_next;
+  vlib_node_t *next_node;
+
+  /* Allocate the buffers first so that an allocation failure cannot
+   * leak the frame requested below. */
+  no_pak = vec_len (flow->udp_data.stats);
+  vec_validate (buffers, (no_pak - 1));
+  n_alloc = vlib_buffer_alloc (vm, buffers, vec_len (buffers));
+  if (n_alloc != no_pak)
+    {
+      /* Allocation failed: release any partial allocation and retry
+       * on the next timer expiry. */
+      vlib_buffer_free (vm, buffers, n_alloc);
+      vec_free (buffers);
+      return;
+    }
+
+  next_node = vlib_get_node_by_name (vm, (u8 *) "ip6-lookup");
+  nf = vlib_get_frame_to_node (vm, next_node->index);
+  nf->n_vectors = 0;
+  to_next = vlib_frame_vector_args (nf);
+
+  for (i = 0; i < no_pak; i++)
+    {
+      int bogus;
+      b0 = vlib_get_buffer (vm, buffers[i]);
+      stats = flow->udp_data.stats + i;
+      clib_memcpy (b0->data, stats->ping_rewrite, stats->rewrite_len);
+      b0->current_data = 0;
+      b0->current_length = stats->rewrite_len;
+      b0->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
+
+      /* If the session is going down, then set the path down */
+      if ((stats->retry != 0) && ((stats->retry % MAX_PING_RETRIES) == 0))
+        ip6_ioam_analyse_set_paths_down (&stats->analyse_data);
+
+      stats->retry++;
+      stats->analyse_data.pkt_sent++;
+      vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0;
+      vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0;
+      vnet_buffer (b0)->l2_classify.opaque_index = stats->flow_ctx;
+
+      ip6_header_t *ip6 = vlib_buffer_get_current (b0);
+      ip6_hop_by_hop_header_t *hbh = (ip6_hop_by_hop_header_t *) (ip6 + 1);
+      udp_header_t *udp =
+        (udp_header_t *) ((u8 *) hbh + ((hbh->length + 1) << 3));
+
+      /* If the session is down, then set the loopback flag in the probe.
+       * This is for fault isolation.
+       */
+      if (flow->fault_det && (stats->retry > MAX_PING_RETRIES))
+        {
+          ioam_trace_option_t *opt = (ioam_trace_option_t *)
+            ip6_hbh_get_option (hbh, HBH_OPTION_TYPE_IOAM_TRACE_DATA_LIST);
+          ip6_hbh_ioam_trace_set_bit (opt, BIT_LOOPBACK);
+        }
+
+      /* The checksum is not pre-computed, as we intend to vary the packet
+       * length for every probe.  That isn't done yet, but is to be taken
+       * up later.
+       */
+      udp->checksum =
+        ip6_tcp_udp_icmp_compute_checksum (vm, b0, ip6, &bogus);
+      ASSERT (bogus == 0);
+      if (udp->checksum == 0)
+        udp->checksum = 0xffff;
+
+      if (nf->n_vectors == VLIB_FRAME_SIZE)
+        {
+          vlib_put_frame_to_node (vm, next_node->index, nf);
+          nf = vlib_get_frame_to_node (vm, next_node->index);
+          nf->n_vectors = 0;
+          to_next = vlib_frame_vector_args (nf);
+        }
+      *to_next = buffers[i];
+      nf->n_vectors++;
+      to_next++;
+    }
+  vlib_put_frame_to_node (vm, next_node->index, nf);
+  vec_free (buffers);
+
+  flow->udp_data.next_send_time =
+    vlib_time_now (vm) + flow->udp_data.interval;
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/ioam/udp-ping/udp_ping_util.h b/src/plugins/ioam/udp-ping/udp_ping_util.h
new file mode 100644
index 00000000..fcaf27bd
--- /dev/null
+++ b/src/plugins/ioam/udp-ping/udp_ping_util.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef PLUGINS_IOAM_PLUGIN_IOAM_UDP_PING_UDP_PING_UTIL_H_ +#define PLUGINS_IOAM_PLUGIN_IOAM_UDP_PING_UDP_PING_UTIL_H_ + +int udp_ping_create_ip6_pak (u8 * buf, /*u16 len, */ + ip6_address_t src, ip6_address_t dst, + u16 src_port, u16 dst_port, + u8 msg_type, u16 ctx); + +int +udp_ping_compare_flow (ip46_address_t src, ip46_address_t dst, + u16 start_src_port, u16 end_src_port, + u16 start_dst_port, u16 end_dst_port, + ip46_udp_ping_flow * flow); + +void +udp_ping_populate_flow (ip46_address_t src, ip46_address_t dst, + u16 start_src_port, u16 end_src_port, + u16 start_dst_port, u16 end_dst_port, + u16 interval, u8 fault_det, + ip46_udp_ping_flow * flow); + +void udp_ping_free_flow_data (ip46_udp_ping_flow * flow); + +void udp_ping_create_rewrite (ip46_udp_ping_flow * flow, u16 ctx); + +void udp_ping_send_ip6_pak (vlib_main_t * vm, ip46_udp_ping_flow * flow); + +/** + * @brief Create and send ipv6 udp-ping response packet. + * + */ +always_inline void +udp_ping_create_reply_from_probe_ip6 (ip6_header_t * ip, + ip6_hop_by_hop_header_t * hbh, + udp_ping_t * udp) +{ + ip6_address_t src; + u16 src_port; + ioam_trace_option_t *trace; + + src = ip->src_address; + + ip->src_address = ip->dst_address; + ip->dst_address = src; + + trace = (ioam_trace_option_t *) + ip6_hbh_get_option (hbh, HBH_OPTION_TYPE_IOAM_TRACE_DATA_LIST); + ip6_hbh_ioam_trace_reset_bit (trace, BIT_LOOPBACK); + + /* No need of endian transform */ + src_port = udp->udp.src_port; + + udp->udp.src_port = udp->udp.dst_port; + udp->udp.dst_port = src_port; + udp->udp.checksum = 0; //FIXME + + udp->ping_data.msg_type = UDP_PING_REPLY; +} + +#endif /* PLUGINS_IOAM_PLUGIN_IOAM_UDP_PING_UDP_PING_UTIL_H_ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ixge.am b/src/plugins/ixge.am new file mode 100644 index 00000000..7e61344b --- /dev/null +++ b/src/plugins/ixge.am @@ -0,0 +1,20 @@ +# Copyright (c) 2016 Cisco Systems, Inc. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +vppplugins_LTLIBRARIES += ixge_plugin.la + +ixge_plugin_la_SOURCES = ixge/ixge.c + +noinst_HEADERS += ixge/ixge.h + +# vi:syntax=automake diff --git a/src/plugins/ixge/ixge.c b/src/plugins/ixge/ixge.c new file mode 100644 index 00000000..222c148c --- /dev/null +++ b/src/plugins/ixge/ixge.c @@ -0,0 +1,2958 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * WARNING! 
+ * This driver is not intended for production use and it is unsupported. + * It is provided for educational use only. + * Please use supported DPDK driver instead. + */ + +#if __x86_64__ || __i386__ +#include <vppinfra/vector.h> + +#ifndef CLIB_HAVE_VEC128 +#warning HACK: ixge driver wont really work, missing u32x4 +typedef unsigned long long u32x4; +#endif + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <vlib/pci/pci.h> +#include <vnet/vnet.h> +#include <ixge/ixge.h> +#include <vnet/ethernet/ethernet.h> +#include <vnet/plugin/plugin.h> +#include <vpp/app/version.h> + +#define IXGE_ALWAYS_POLL 0 + +#define EVENT_SET_FLAGS 0 +#define IXGE_HWBP_RACE_ELOG 0 + +#define PCI_VENDOR_ID_INTEL 0x8086 + +/* 10 GIG E (XGE) PHY IEEE 802.3 clause 45 definitions. */ +#define XGE_PHY_DEV_TYPE_PMA_PMD 1 +#define XGE_PHY_DEV_TYPE_PHY_XS 4 +#define XGE_PHY_ID1 0x2 +#define XGE_PHY_ID2 0x3 +#define XGE_PHY_CONTROL 0x0 +#define XGE_PHY_CONTROL_RESET (1 << 15) + +ixge_main_t ixge_main; +static vlib_node_registration_t ixge_input_node; +static vlib_node_registration_t ixge_process_node; + +static void +ixge_semaphore_get (ixge_device_t * xd) +{ + ixge_main_t *xm = &ixge_main; + vlib_main_t *vm = xm->vlib_main; + ixge_regs_t *r = xd->regs; + u32 i; + + i = 0; + while (!(r->software_semaphore & (1 << 0))) + { + if (i > 0) + vlib_process_suspend (vm, 100e-6); + i++; + } + do + { + r->software_semaphore |= 1 << 1; + } + while (!(r->software_semaphore & (1 << 1))); +} + +static void +ixge_semaphore_release (ixge_device_t * xd) +{ + ixge_regs_t *r = xd->regs; + r->software_semaphore &= ~3; +} + +static void +ixge_software_firmware_sync (ixge_device_t * xd, u32 sw_mask) +{ + ixge_main_t *xm = &ixge_main; + vlib_main_t *vm = xm->vlib_main; + ixge_regs_t *r = xd->regs; + u32 fw_mask = sw_mask << 5; + u32 m, done = 0; + + while (!done) + { + ixge_semaphore_get (xd); + m = r->software_firmware_sync; + done = (m & fw_mask) == 0; + if (done) + r->software_firmware_sync = m | sw_mask; + ixge_semaphore_release (xd); + if (!done) + vlib_process_suspend (vm, 10e-3); + } +} + +static void +ixge_software_firmware_sync_release (ixge_device_t * xd, u32 sw_mask) +{ + ixge_regs_t *r = xd->regs; + ixge_semaphore_get (xd); + r->software_firmware_sync &= ~sw_mask; + ixge_semaphore_release (xd); +} + +u32 +ixge_read_write_phy_reg (ixge_device_t * xd, u32 dev_type, u32 reg_index, + u32 v, u32 is_read) +{ + ixge_regs_t *r = xd->regs; + const u32 busy_bit = 1 << 30; + u32 x; + + ASSERT (xd->phy_index < 2); + ixge_software_firmware_sync (xd, 1 << (1 + xd->phy_index)); + + ASSERT (reg_index < (1 << 16)); + ASSERT (dev_type < (1 << 5)); + if (!is_read) + r->xge_mac.phy_data = v; + + /* Address cycle. */ + x = + reg_index | (dev_type << 16) | (xd-> + phys[xd->phy_index].mdio_address << 21); + r->xge_mac.phy_command = x | busy_bit; + /* Busy wait timed to take 28e-6 secs. No suspend. */ + while (r->xge_mac.phy_command & busy_bit) + ; + + r->xge_mac.phy_command = x | ((is_read ? 
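/* op-code in bits 27:26: 10b = read, 01b = write; the bare command above (op 00b) was the clause-45 address cycle */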
2 : 1) << 26) | busy_bit; + while (r->xge_mac.phy_command & busy_bit) + ; + + if (is_read) + v = r->xge_mac.phy_data >> 16; + + ixge_software_firmware_sync_release (xd, 1 << (1 + xd->phy_index)); + + return v; +} + +static u32 +ixge_read_phy_reg (ixge_device_t * xd, u32 dev_type, u32 reg_index) +{ + return ixge_read_write_phy_reg (xd, dev_type, reg_index, 0, /* is_read */ + 1); +} + +static void +ixge_write_phy_reg (ixge_device_t * xd, u32 dev_type, u32 reg_index, u32 v) +{ + (void) ixge_read_write_phy_reg (xd, dev_type, reg_index, v, /* is_read */ + 0); +} + +static void +ixge_i2c_put_bits (i2c_bus_t * b, int scl, int sda) +{ + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, b->private_data); + u32 v; + + v = 0; + v |= (sda != 0) << 3; + v |= (scl != 0) << 1; + xd->regs->i2c_control = v; +} + +static void +ixge_i2c_get_bits (i2c_bus_t * b, int *scl, int *sda) +{ + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, b->private_data); + u32 v; + + v = xd->regs->i2c_control; + *sda = (v & (1 << 2)) != 0; + *scl = (v & (1 << 0)) != 0; +} + +static u16 +ixge_read_eeprom (ixge_device_t * xd, u32 address) +{ + ixge_regs_t *r = xd->regs; + u32 v; + r->eeprom_read = (( /* start bit */ (1 << 0)) | (address << 2)); + /* Wait for done bit. */ + while (!((v = r->eeprom_read) & (1 << 1))) + ; + return v >> 16; +} + +static void +ixge_sfp_enable_disable_laser (ixge_device_t * xd, uword enable) +{ + u32 tx_disable_bit = 1 << 3; + if (enable) + xd->regs->sdp_control &= ~tx_disable_bit; + else + xd->regs->sdp_control |= tx_disable_bit; +} + +static void +ixge_sfp_enable_disable_10g (ixge_device_t * xd, uword enable) +{ + u32 is_10g_bit = 1 << 5; + if (enable) + xd->regs->sdp_control |= is_10g_bit; + else + xd->regs->sdp_control &= ~is_10g_bit; +} + +static clib_error_t * +ixge_sfp_phy_init_from_eeprom (ixge_device_t * xd, u16 sfp_type) +{ + u16 a, id, reg_values_addr = 0; + + a = ixge_read_eeprom (xd, 0x2b); + if (a == 0 || a == 0xffff) + return clib_error_create ("no init sequence in eeprom"); + + while (1) + { + id = ixge_read_eeprom (xd, ++a); + if (id == 0xffff) + break; + reg_values_addr = ixge_read_eeprom (xd, ++a); + if (id == sfp_type) + break; + } + if (id != sfp_type) + return clib_error_create ("failed to find id 0x%x", sfp_type); + + ixge_software_firmware_sync (xd, 1 << 3); + while (1) + { + u16 v = ixge_read_eeprom (xd, ++reg_values_addr); + if (v == 0xffff) + break; + xd->regs->core_analog_config = v; + } + ixge_software_firmware_sync_release (xd, 1 << 3); + + /* Make sure laser is off. We'll turn on the laser when + the interface is brought up. */ + ixge_sfp_enable_disable_laser (xd, /* enable */ 0); + ixge_sfp_enable_disable_10g (xd, /* is_10g */ 1); + + return 0; +} + +static void +ixge_sfp_device_up_down (ixge_device_t * xd, uword is_up) +{ + u32 v; + + if (is_up) + { + /* pma/pmd 10g serial SFI. */ + xd->regs->xge_mac.auto_negotiation_control2 &= ~(3 << 16); + xd->regs->xge_mac.auto_negotiation_control2 |= 2 << 16; + + v = xd->regs->xge_mac.auto_negotiation_control; + v &= ~(7 << 13); + v |= (0 << 13); + /* Restart autoneg. */ + v |= (1 << 12); + xd->regs->xge_mac.auto_negotiation_control = v; + + while (!(xd->regs->xge_mac.link_partner_ability[0] & 0xf0000)) + ; + + v = xd->regs->xge_mac.auto_negotiation_control; + + /* link mode 10g sfi serdes */ + v &= ~(7 << 13); + v |= (3 << 13); + + /* Restart autoneg. 
*/ + v |= (1 << 12); + xd->regs->xge_mac.auto_negotiation_control = v; + + xd->regs->xge_mac.link_status; + } + + ixge_sfp_enable_disable_laser (xd, /* enable */ is_up); + + /* Give time for link partner to notice that we're up. */ + if (is_up && vlib_in_process_context (vlib_get_main ())) + { + vlib_process_suspend (vlib_get_main (), 300e-3); + } +} + +always_inline ixge_dma_regs_t * +get_dma_regs (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 qi) +{ + ixge_regs_t *r = xd->regs; + ASSERT (qi < 128); + if (rt == VLIB_RX) + return qi < 64 ? &r->rx_dma0[qi] : &r->rx_dma1[qi - 64]; + else + return &r->tx_dma[qi]; +} + +static clib_error_t * +ixge_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags) +{ + vnet_hw_interface_t *hif = vnet_get_hw_interface (vnm, hw_if_index); + uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0; + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, hif->dev_instance); + ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_RX, 0); + + if (is_up) + { + xd->regs->rx_enable |= 1; + xd->regs->tx_dma_control |= 1; + dr->control |= 1 << 25; + while (!(dr->control & (1 << 25))) + ; + } + else + { + xd->regs->rx_enable &= ~1; + xd->regs->tx_dma_control &= ~1; + } + + ixge_sfp_device_up_down (xd, is_up); + + return /* no error */ 0; +} + +static void +ixge_sfp_phy_init (ixge_device_t * xd) +{ + ixge_phy_t *phy = xd->phys + xd->phy_index; + i2c_bus_t *ib = &xd->i2c_bus; + + ib->private_data = xd->device_index; + ib->put_bits = ixge_i2c_put_bits; + ib->get_bits = ixge_i2c_get_bits; + vlib_i2c_init (ib); + + vlib_i2c_read_eeprom (ib, 0x50, 0, 128, (u8 *) & xd->sfp_eeprom); + + if (vlib_i2c_bus_timed_out (ib) || !sfp_eeprom_is_valid (&xd->sfp_eeprom)) + xd->sfp_eeprom.id = SFP_ID_unknown; + else + { + /* FIXME 5 => SR/LR eeprom ID. */ + clib_error_t *e = + ixge_sfp_phy_init_from_eeprom (xd, 5 + xd->pci_function); + if (e) + clib_error_report (e); + } + + phy->mdio_address = ~0; +} + +static void +ixge_phy_init (ixge_device_t * xd) +{ + ixge_main_t *xm = &ixge_main; + vlib_main_t *vm = xm->vlib_main; + ixge_phy_t *phy = xd->phys + xd->phy_index; + + switch (xd->device_id) + { + case IXGE_82599_sfp: + case IXGE_82599_sfp_em: + case IXGE_82599_sfp_fcoe: + /* others? */ + return ixge_sfp_phy_init (xd); + + default: + break; + } + + /* Probe address of phy. */ + { + u32 i, v; + + phy->mdio_address = ~0; + for (i = 0; i < 32; i++) + { + phy->mdio_address = i; + v = ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PMA_PMD, XGE_PHY_ID1); + if (v != 0xffff && v != 0) + break; + } + + /* No PHY found? */ + if (i >= 32) + return; + } + + phy->id = + ((ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PMA_PMD, XGE_PHY_ID1) << 16) | + ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PMA_PMD, XGE_PHY_ID2)); + + { + ELOG_TYPE_DECLARE (e) = + { + .function = (char *) __FUNCTION__,.format = + "ixge %d, phy id 0x%d mdio address %d",.format_args = "i4i4i4",}; + struct + { + u32 instance, id, address; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->instance = xd->device_index; + ed->id = phy->id; + ed->address = phy->mdio_address; + } + + /* Reset phy. */ + ixge_write_phy_reg (xd, XGE_PHY_DEV_TYPE_PHY_XS, XGE_PHY_CONTROL, + XGE_PHY_CONTROL_RESET); + + /* Wait for self-clearning reset bit to clear. 
*/ + do + { + vlib_process_suspend (vm, 1e-3); + } + while (ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PHY_XS, XGE_PHY_CONTROL) & + XGE_PHY_CONTROL_RESET); +} + +static u8 * +format_ixge_rx_from_hw_descriptor (u8 * s, va_list * va) +{ + ixge_rx_from_hw_descriptor_t *d = + va_arg (*va, ixge_rx_from_hw_descriptor_t *); + u32 s0 = d->status[0], s2 = d->status[2]; + u32 is_ip4, is_ip6, is_ip, is_tcp, is_udp; + uword indent = format_get_indent (s); + + s = format (s, "%s-owned", + (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE) ? "sw" : + "hw"); + s = + format (s, ", length this descriptor %d, l3 offset %d", + d->n_packet_bytes_this_descriptor, + IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s0)); + if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET) + s = format (s, ", end-of-packet"); + + s = format (s, "\n%U", format_white_space, indent); + + if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_ETHERNET_ERROR) + s = format (s, "layer2 error"); + + if (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_LAYER2) + { + s = format (s, "layer 2 type %d", (s0 & 0x1f)); + return s; + } + + if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_VLAN) + s = format (s, "vlan header 0x%x\n%U", d->vlan_tag, + format_white_space, indent); + + if ((is_ip4 = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4))) + { + s = format (s, "ip4%s", + (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4_EXT) ? " options" : + ""); + if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED) + s = format (s, " checksum %s", + (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR) ? + "bad" : "ok"); + } + if ((is_ip6 = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6))) + s = format (s, "ip6%s", + (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6_EXT) ? " extended" : + ""); + is_tcp = is_udp = 0; + if ((is_ip = (is_ip4 | is_ip6))) + { + is_tcp = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_TCP) != 0; + is_udp = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_UDP) != 0; + if (is_tcp) + s = format (s, ", tcp"); + if (is_udp) + s = format (s, ", udp"); + } + + if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED) + s = format (s, ", tcp checksum %s", + (s2 & IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR) ? "bad" : + "ok"); + if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED) + s = format (s, ", udp checksum %s", + (s2 & IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR) ? "bad" : + "ok"); + + return s; +} + +static u8 * +format_ixge_tx_descriptor (u8 * s, va_list * va) +{ + ixge_tx_descriptor_t *d = va_arg (*va, ixge_tx_descriptor_t *); + u32 s0 = d->status0, s1 = d->status1; + uword indent = format_get_indent (s); + u32 v; + + s = format (s, "buffer 0x%Lx, %d packet bytes, %d bytes this buffer", + d->buffer_address, s1 >> 14, d->n_bytes_this_buffer); + + s = format (s, "\n%U", format_white_space, indent); + + if ((v = (s0 >> 0) & 3)) + s = format (s, "reserved 0x%x, ", v); + + if ((v = (s0 >> 2) & 3)) + s = format (s, "mac 0x%x, ", v); + + if ((v = (s0 >> 4) & 0xf) != 3) + s = format (s, "type 0x%x, ", v); + + s = format (s, "%s%s%s%s%s%s%s%s", + (s0 & (1 << 8)) ? "eop, " : "", + (s0 & (1 << 9)) ? "insert-fcs, " : "", + (s0 & (1 << 10)) ? "reserved26, " : "", + (s0 & (1 << 11)) ? "report-status, " : "", + (s0 & (1 << 12)) ? "reserved28, " : "", + (s0 & (1 << 13)) ? "is-advanced, " : "", + (s0 & (1 << 14)) ? "vlan-enable, " : "", + (s0 & (1 << 15)) ? 
"tx-segmentation, " : ""); + + if ((v = s1 & 0xf) != 0) + s = format (s, "status 0x%x, ", v); + + if ((v = (s1 >> 4) & 0xf)) + s = format (s, "context 0x%x, ", v); + + if ((v = (s1 >> 8) & 0x3f)) + s = format (s, "options 0x%x, ", v); + + return s; +} + +typedef struct +{ + ixge_descriptor_t before, after; + + u32 buffer_index; + + u16 device_index; + + u8 queue_index; + + u8 is_start_of_packet; + + /* Copy of VLIB buffer; packet data stored in pre_data. */ + vlib_buffer_t buffer; +} ixge_rx_dma_trace_t; + +static u8 * +format_ixge_rx_dma_trace (u8 * s, va_list * va) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *); + vlib_node_t *node = va_arg (*va, vlib_node_t *); + vnet_main_t *vnm = vnet_get_main (); + ixge_rx_dma_trace_t *t = va_arg (*va, ixge_rx_dma_trace_t *); + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, t->device_index); + format_function_t *f; + uword indent = format_get_indent (s); + + { + vnet_sw_interface_t *sw = + vnet_get_sw_interface (vnm, xd->vlib_sw_if_index); + s = + format (s, "%U rx queue %d", format_vnet_sw_interface_name, vnm, sw, + t->queue_index); + } + + s = format (s, "\n%Ubefore: %U", + format_white_space, indent, + format_ixge_rx_from_hw_descriptor, &t->before); + s = format (s, "\n%Uafter : head/tail address 0x%Lx/0x%Lx", + format_white_space, indent, + t->after.rx_to_hw.head_address, t->after.rx_to_hw.tail_address); + + s = format (s, "\n%Ubuffer 0x%x: %U", + format_white_space, indent, + t->buffer_index, format_vlib_buffer, &t->buffer); + + s = format (s, "\n%U", format_white_space, indent); + + f = node->format_buffer; + if (!f || !t->is_start_of_packet) + f = format_hex_bytes; + s = format (s, "%U", f, t->buffer.pre_data, sizeof (t->buffer.pre_data)); + + return s; +} + +#define foreach_ixge_error \ + _ (none, "no error") \ + _ (tx_full_drops, "tx ring full drops") \ + _ (ip4_checksum_error, "ip4 checksum errors") \ + _ (rx_alloc_fail, "rx buf alloc from free list failed") \ + _ (rx_alloc_no_physmem, "rx buf alloc failed no physmem") + +typedef enum +{ +#define _(f,s) IXGE_ERROR_##f, + foreach_ixge_error +#undef _ + IXGE_N_ERROR, +} ixge_error_t; + +always_inline void +ixge_rx_next_and_error_from_status_x1 (ixge_device_t * xd, + u32 s00, u32 s02, + u8 * next0, u8 * error0, u32 * flags0) +{ + u8 is0_ip4, is0_ip6, n0, e0; + u32 f0; + + e0 = IXGE_ERROR_none; + n0 = IXGE_RX_NEXT_ETHERNET_INPUT; + + is0_ip4 = s02 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED; + n0 = is0_ip4 ? IXGE_RX_NEXT_IP4_INPUT : n0; + + e0 = (is0_ip4 && (s02 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR) + ? IXGE_ERROR_ip4_checksum_error : e0); + + is0_ip6 = s00 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6; + n0 = is0_ip6 ? IXGE_RX_NEXT_IP6_INPUT : n0; + + n0 = (xd->per_interface_next_index != ~0) ? + xd->per_interface_next_index : n0; + + /* Check for error. */ + n0 = e0 != IXGE_ERROR_none ? IXGE_RX_NEXT_DROP : n0; + + f0 = ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED + | IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED)) + ? VNET_BUFFER_F_L4_CHECKSUM_COMPUTED : 0); + + f0 |= ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR + | IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR)) + ? 
0 : VNET_BUFFER_F_L4_CHECKSUM_CORRECT); + + *error0 = e0; + *next0 = n0; + *flags0 = f0; +} + +always_inline void +ixge_rx_next_and_error_from_status_x2 (ixge_device_t * xd, + u32 s00, u32 s02, + u32 s10, u32 s12, + u8 * next0, u8 * error0, u32 * flags0, + u8 * next1, u8 * error1, u32 * flags1) +{ + u8 is0_ip4, is0_ip6, n0, e0; + u8 is1_ip4, is1_ip6, n1, e1; + u32 f0, f1; + + e0 = e1 = IXGE_ERROR_none; + n0 = n1 = IXGE_RX_NEXT_IP4_INPUT; + + is0_ip4 = s02 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED; + is1_ip4 = s12 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED; + + n0 = is0_ip4 ? IXGE_RX_NEXT_IP4_INPUT : n0; + n1 = is1_ip4 ? IXGE_RX_NEXT_IP4_INPUT : n1; + + e0 = (is0_ip4 && (s02 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR) + ? IXGE_ERROR_ip4_checksum_error : e0); + e1 = (is1_ip4 && (s12 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR) + ? IXGE_ERROR_ip4_checksum_error : e1); + + is0_ip6 = s00 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6; + is1_ip6 = s10 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6; + + n0 = is0_ip6 ? IXGE_RX_NEXT_IP6_INPUT : n0; + n1 = is1_ip6 ? IXGE_RX_NEXT_IP6_INPUT : n1; + + n0 = (xd->per_interface_next_index != ~0) ? + xd->per_interface_next_index : n0; + n1 = (xd->per_interface_next_index != ~0) ? + xd->per_interface_next_index : n1; + + /* Check for error. */ + n0 = e0 != IXGE_ERROR_none ? IXGE_RX_NEXT_DROP : n0; + n1 = e1 != IXGE_ERROR_none ? IXGE_RX_NEXT_DROP : n1; + + *error0 = e0; + *error1 = e1; + + *next0 = n0; + *next1 = n1; + + f0 = ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED + | IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED)) + ? VNET_BUFFER_F_L4_CHECKSUM_COMPUTED : 0); + f1 = ((s12 & (IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED + | IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED)) + ? VNET_BUFFER_F_L4_CHECKSUM_COMPUTED : 0); + + f0 |= ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR + | IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR)) + ? 0 : VNET_BUFFER_F_L4_CHECKSUM_CORRECT); + f1 |= ((s12 & (IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR + | IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR)) + ? 0 : VNET_BUFFER_F_L4_CHECKSUM_CORRECT); + + *flags0 = f0; + *flags1 = f1; +} + +static void +ixge_rx_trace (ixge_main_t * xm, + ixge_device_t * xd, + ixge_dma_queue_t * dq, + ixge_descriptor_t * before_descriptors, + u32 * before_buffers, + ixge_descriptor_t * after_descriptors, uword n_descriptors) +{ + vlib_main_t *vm = xm->vlib_main; + vlib_node_runtime_t *node = dq->rx.node; + ixge_rx_from_hw_descriptor_t *bd; + ixge_rx_to_hw_descriptor_t *ad; + u32 *b, n_left, is_sop, next_index_sop; + + n_left = n_descriptors; + b = before_buffers; + bd = &before_descriptors->rx_from_hw; + ad = &after_descriptors->rx_to_hw; + is_sop = dq->rx.is_start_of_packet; + next_index_sop = dq->rx.saved_start_of_packet_next_index; + + while (n_left >= 2) + { + u32 bi0, bi1, flags0, flags1; + vlib_buffer_t *b0, *b1; + ixge_rx_dma_trace_t *t0, *t1; + u8 next0, error0, next1, error1; + + bi0 = b[0]; + bi1 = b[1]; + n_left -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + ixge_rx_next_and_error_from_status_x2 (xd, + bd[0].status[0], bd[0].status[2], + bd[1].status[0], bd[1].status[2], + &next0, &error0, &flags0, + &next1, &error1, &flags1); + + next_index_sop = is_sop ? next0 : next_index_sop; + vlib_trace_buffer (vm, node, next_index_sop, b0, /* follow_chain */ 0); + t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); + t0->is_start_of_packet = is_sop; + is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + next_index_sop = is_sop ? 
next1 : next_index_sop; + vlib_trace_buffer (vm, node, next_index_sop, b1, /* follow_chain */ 0); + t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0])); + t1->is_start_of_packet = is_sop; + is_sop = (b1->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + t0->queue_index = dq->queue_index; + t1->queue_index = dq->queue_index; + t0->device_index = xd->device_index; + t1->device_index = xd->device_index; + t0->before.rx_from_hw = bd[0]; + t1->before.rx_from_hw = bd[1]; + t0->after.rx_to_hw = ad[0]; + t1->after.rx_to_hw = ad[1]; + t0->buffer_index = bi0; + t1->buffer_index = bi1; + memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data)); + memcpy (&t1->buffer, b1, sizeof (b1[0]) - sizeof (b0->pre_data)); + memcpy (t0->buffer.pre_data, b0->data + b0->current_data, + sizeof (t0->buffer.pre_data)); + memcpy (t1->buffer.pre_data, b1->data + b1->current_data, + sizeof (t1->buffer.pre_data)); + + b += 2; + bd += 2; + ad += 2; + } + + while (n_left >= 1) + { + u32 bi0, flags0; + vlib_buffer_t *b0; + ixge_rx_dma_trace_t *t0; + u8 next0, error0; + + bi0 = b[0]; + n_left -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + ixge_rx_next_and_error_from_status_x1 (xd, + bd[0].status[0], bd[0].status[2], + &next0, &error0, &flags0); + + next_index_sop = is_sop ? next0 : next_index_sop; + vlib_trace_buffer (vm, node, next_index_sop, b0, /* follow_chain */ 0); + t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); + t0->is_start_of_packet = is_sop; + is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + t0->queue_index = dq->queue_index; + t0->device_index = xd->device_index; + t0->before.rx_from_hw = bd[0]; + t0->after.rx_to_hw = ad[0]; + t0->buffer_index = bi0; + memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data)); + memcpy (t0->buffer.pre_data, b0->data + b0->current_data, + sizeof (t0->buffer.pre_data)); + + b += 1; + bd += 1; + ad += 1; + } +} + +typedef struct +{ + ixge_tx_descriptor_t descriptor; + + u32 buffer_index; + + u16 device_index; + + u8 queue_index; + + u8 is_start_of_packet; + + /* Copy of VLIB buffer; packet data stored in pre_data. 
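+ Only the buffer metadata is copied (sizeof (vlib_buffer_t) minus the
+ pre_data area); the first sizeof (pre_data) bytes of packet payload
+ are then stashed into pre_data, so the trace can be formatted long
+ after the buffer itself has been recycled.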
*/ + vlib_buffer_t buffer; +} ixge_tx_dma_trace_t; + +static u8 * +format_ixge_tx_dma_trace (u8 * s, va_list * va) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *); + ixge_tx_dma_trace_t *t = va_arg (*va, ixge_tx_dma_trace_t *); + vnet_main_t *vnm = vnet_get_main (); + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, t->device_index); + format_function_t *f; + uword indent = format_get_indent (s); + + { + vnet_sw_interface_t *sw = + vnet_get_sw_interface (vnm, xd->vlib_sw_if_index); + s = + format (s, "%U tx queue %d", format_vnet_sw_interface_name, vnm, sw, + t->queue_index); + } + + s = format (s, "\n%Udescriptor: %U", + format_white_space, indent, + format_ixge_tx_descriptor, &t->descriptor); + + s = format (s, "\n%Ubuffer 0x%x: %U", + format_white_space, indent, + t->buffer_index, format_vlib_buffer, &t->buffer); + + s = format (s, "\n%U", format_white_space, indent); + + f = format_ethernet_header_with_length; + if (!f || !t->is_start_of_packet) + f = format_hex_bytes; + s = format (s, "%U", f, t->buffer.pre_data, sizeof (t->buffer.pre_data)); + + return s; +} + +typedef struct +{ + vlib_node_runtime_t *node; + + u32 is_start_of_packet; + + u32 n_bytes_in_packet; + + ixge_tx_descriptor_t *start_of_packet_descriptor; +} ixge_tx_state_t; + +static void +ixge_tx_trace (ixge_main_t * xm, + ixge_device_t * xd, + ixge_dma_queue_t * dq, + ixge_tx_state_t * tx_state, + ixge_tx_descriptor_t * descriptors, + u32 * buffers, uword n_descriptors) +{ + vlib_main_t *vm = xm->vlib_main; + vlib_node_runtime_t *node = tx_state->node; + ixge_tx_descriptor_t *d; + u32 *b, n_left, is_sop; + + n_left = n_descriptors; + b = buffers; + d = descriptors; + is_sop = tx_state->is_start_of_packet; + + while (n_left >= 2) + { + u32 bi0, bi1; + vlib_buffer_t *b0, *b1; + ixge_tx_dma_trace_t *t0, *t1; + + bi0 = b[0]; + bi1 = b[1]; + n_left -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); + t0->is_start_of_packet = is_sop; + is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0])); + t1->is_start_of_packet = is_sop; + is_sop = (b1->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + t0->queue_index = dq->queue_index; + t1->queue_index = dq->queue_index; + t0->device_index = xd->device_index; + t1->device_index = xd->device_index; + t0->descriptor = d[0]; + t1->descriptor = d[1]; + t0->buffer_index = bi0; + t1->buffer_index = bi1; + memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data)); + memcpy (&t1->buffer, b1, sizeof (b1[0]) - sizeof (b0->pre_data)); + memcpy (t0->buffer.pre_data, b0->data + b0->current_data, + sizeof (t0->buffer.pre_data)); + memcpy (t1->buffer.pre_data, b1->data + b1->current_data, + sizeof (t1->buffer.pre_data)); + + b += 2; + d += 2; + } + + while (n_left >= 1) + { + u32 bi0; + vlib_buffer_t *b0; + ixge_tx_dma_trace_t *t0; + + bi0 = b[0]; + n_left -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); + t0->is_start_of_packet = is_sop; + is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + t0->queue_index = dq->queue_index; + t0->device_index = xd->device_index; + t0->descriptor = d[0]; + t0->buffer_index = bi0; + memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data)); + memcpy (t0->buffer.pre_data, b0->data + b0->current_data, + sizeof (t0->buffer.pre_data)); + + b += 1; + d += 1; + } +} 
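+
+/* The descriptor ring is a circular buffer; ixge_ring_sub and
+   ixge_ring_add below do the modular index arithmetic. A worked
+   example, with n_descriptors == 512, head == 510, tail == 2:
+     ixge_ring_sub (q, 510, 2) == 4   four slots from index 510 around to 2
+     ixge_ring_add (q, 510, 4) == 2   advancing 4 slots wraps past the end
+   (512 is illustrative; both indices must already be in
+   [0, n_descriptors).) */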
+ +always_inline uword +ixge_ring_sub (ixge_dma_queue_t * q, u32 i0, u32 i1) +{ + i32 d = i1 - i0; + ASSERT (i0 < q->n_descriptors); + ASSERT (i1 < q->n_descriptors); + return d < 0 ? q->n_descriptors + d : d; +} + +always_inline uword +ixge_ring_add (ixge_dma_queue_t * q, u32 i0, u32 i1) +{ + u32 d = i0 + i1; + ASSERT (i0 < q->n_descriptors); + ASSERT (i1 < q->n_descriptors); + d -= d >= q->n_descriptors ? q->n_descriptors : 0; + return d; +} + +always_inline uword +ixge_tx_descriptor_matches_template (ixge_main_t * xm, + ixge_tx_descriptor_t * d) +{ + u32 cmp; + + cmp = ((d->status0 & xm->tx_descriptor_template_mask.status0) + ^ xm->tx_descriptor_template.status0); + if (cmp) + return 0; + cmp = ((d->status1 & xm->tx_descriptor_template_mask.status1) + ^ xm->tx_descriptor_template.status1); + if (cmp) + return 0; + + return 1; +} + +static uword +ixge_tx_no_wrap (ixge_main_t * xm, + ixge_device_t * xd, + ixge_dma_queue_t * dq, + u32 * buffers, + u32 start_descriptor_index, + u32 n_descriptors, ixge_tx_state_t * tx_state) +{ + vlib_main_t *vm = xm->vlib_main; + ixge_tx_descriptor_t *d, *d_sop; + u32 n_left = n_descriptors; + u32 *to_free = vec_end (xm->tx_buffers_pending_free); + u32 *to_tx = + vec_elt_at_index (dq->descriptor_buffer_indices, start_descriptor_index); + u32 is_sop = tx_state->is_start_of_packet; + u32 len_sop = tx_state->n_bytes_in_packet; + u16 template_status = xm->tx_descriptor_template.status0; + u32 descriptor_prefetch_rotor = 0; + + ASSERT (start_descriptor_index + n_descriptors <= dq->n_descriptors); + d = &dq->descriptors[start_descriptor_index].tx; + d_sop = is_sop ? d : tx_state->start_of_packet_descriptor; + + while (n_left >= 4) + { + vlib_buffer_t *b0, *b1; + u32 bi0, fi0, len0; + u32 bi1, fi1, len1; + u8 is_eop0, is_eop1; + + /* Prefetch next iteration. */ + vlib_prefetch_buffer_with_index (vm, buffers[2], LOAD); + vlib_prefetch_buffer_with_index (vm, buffers[3], LOAD); + + if ((descriptor_prefetch_rotor & 0x3) == 0) + CLIB_PREFETCH (d + 4, CLIB_CACHE_LINE_BYTES, STORE); + + descriptor_prefetch_rotor += 2; + + bi0 = buffers[0]; + bi1 = buffers[1]; + + to_free[0] = fi0 = to_tx[0]; + to_tx[0] = bi0; + to_free += fi0 != 0; + + to_free[0] = fi1 = to_tx[1]; + to_tx[1] = bi1; + to_free += fi1 != 0; + + buffers += 2; + n_left -= 2; + to_tx += 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + is_eop0 = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + is_eop1 = (b1->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + len0 = b0->current_length; + len1 = b1->current_length; + + ASSERT (ixge_tx_descriptor_matches_template (xm, d + 0)); + ASSERT (ixge_tx_descriptor_matches_template (xm, d + 1)); + + d[0].buffer_address = + vlib_get_buffer_data_physical_address (vm, bi0) + b0->current_data; + d[1].buffer_address = + vlib_get_buffer_data_physical_address (vm, bi1) + b1->current_data; + + d[0].n_bytes_this_buffer = len0; + d[1].n_bytes_this_buffer = len1; + + d[0].status0 = + template_status | (is_eop0 << + IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET); + d[1].status0 = + template_status | (is_eop1 << + IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET); + + len_sop = (is_sop ? 0 : len_sop) + len0; + d_sop[0].status1 = + IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET (len_sop); + d += 1; + d_sop = is_eop0 ? d : d_sop; + + is_sop = is_eop0; + + len_sop = (is_sop ? 0 : len_sop) + len1; + d_sop[0].status1 = + IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET (len_sop); + d += 1; + d_sop = is_eop1 ? 
d : d_sop; + + is_sop = is_eop1; + } + + while (n_left > 0) + { + vlib_buffer_t *b0; + u32 bi0, fi0, len0; + u8 is_eop0; + + bi0 = buffers[0]; + + to_free[0] = fi0 = to_tx[0]; + to_tx[0] = bi0; + to_free += fi0 != 0; + + buffers += 1; + n_left -= 1; + to_tx += 1; + + b0 = vlib_get_buffer (vm, bi0); + + is_eop0 = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + len0 = b0->current_length; + + ASSERT (ixge_tx_descriptor_matches_template (xm, d + 0)); + + d[0].buffer_address = + vlib_get_buffer_data_physical_address (vm, bi0) + b0->current_data; + + d[0].n_bytes_this_buffer = len0; + + d[0].status0 = + template_status | (is_eop0 << + IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET); + + len_sop = (is_sop ? 0 : len_sop) + len0; + d_sop[0].status1 = + IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET (len_sop); + d += 1; + d_sop = is_eop0 ? d : d_sop; + + is_sop = is_eop0; + } + + if (tx_state->node->flags & VLIB_NODE_FLAG_TRACE) + { + to_tx = + vec_elt_at_index (dq->descriptor_buffer_indices, + start_descriptor_index); + ixge_tx_trace (xm, xd, dq, tx_state, + &dq->descriptors[start_descriptor_index].tx, to_tx, + n_descriptors); + } + + _vec_len (xm->tx_buffers_pending_free) = + to_free - xm->tx_buffers_pending_free; + + /* When we are done d_sop can point to end of ring. Wrap it if so. */ + { + ixge_tx_descriptor_t *d_start = &dq->descriptors[0].tx; + + ASSERT (d_sop - d_start <= dq->n_descriptors); + d_sop = d_sop - d_start == dq->n_descriptors ? d_start : d_sop; + } + + tx_state->is_start_of_packet = is_sop; + tx_state->start_of_packet_descriptor = d_sop; + tx_state->n_bytes_in_packet = len_sop; + + return n_descriptors; +} + +static uword +ixge_interface_tx (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * f) +{ + ixge_main_t *xm = &ixge_main; + vnet_interface_output_runtime_t *rd = (void *) node->runtime_data; + ixge_device_t *xd = vec_elt_at_index (xm->devices, rd->dev_instance); + ixge_dma_queue_t *dq; + u32 *from, n_left_tx, n_descriptors_to_tx, n_tail_drop; + u32 queue_index = 0; /* fixme parameter */ + ixge_tx_state_t tx_state; + + tx_state.node = node; + tx_state.is_start_of_packet = 1; + tx_state.start_of_packet_descriptor = 0; + tx_state.n_bytes_in_packet = 0; + + from = vlib_frame_vector_args (f); + + dq = vec_elt_at_index (xd->dma_queues[VLIB_TX], queue_index); + + dq->head_index = dq->tx.head_index_write_back[0]; + + /* Since head == tail means ring is empty we can send up to dq->n_descriptors - 1. 
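+ A completely full ring would also have head == tail, which would be
+ indistinguishable from empty; leaving one slot unused resolves the
+ ambiguity (e.g. a 512-entry ring carries at most 511 descriptors).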
*/ + n_left_tx = dq->n_descriptors - 1; + n_left_tx -= ixge_ring_sub (dq, dq->head_index, dq->tail_index); + + _vec_len (xm->tx_buffers_pending_free) = 0; + + n_descriptors_to_tx = f->n_vectors; + n_tail_drop = 0; + if (PREDICT_FALSE (n_descriptors_to_tx > n_left_tx)) + { + i32 i, n_ok, i_eop, i_sop; + + i_sop = i_eop = ~0; + for (i = n_left_tx - 1; i >= 0; i--) + { + vlib_buffer_t *b = vlib_get_buffer (vm, from[i]); + if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT)) + { + if (i_sop != ~0 && i_eop != ~0) + break; + i_eop = i; + i_sop = i + 1; + } + } + if (i == 0) + n_ok = 0; + else + n_ok = i_eop + 1; + + { + ELOG_TYPE_DECLARE (e) = + { + .function = (char *) __FUNCTION__,.format = + "ixge %d, ring full to tx %d head %d tail %d",.format_args = + "i2i2i2i2",}; + struct + { + u16 instance, to_tx, head, tail; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->instance = xd->device_index; + ed->to_tx = n_descriptors_to_tx; + ed->head = dq->head_index; + ed->tail = dq->tail_index; + } + + if (n_ok < n_descriptors_to_tx) + { + n_tail_drop = n_descriptors_to_tx - n_ok; + vec_add (xm->tx_buffers_pending_free, from + n_ok, n_tail_drop); + vlib_error_count (vm, ixge_input_node.index, + IXGE_ERROR_tx_full_drops, n_tail_drop); + } + + n_descriptors_to_tx = n_ok; + } + + dq->tx.n_buffers_on_ring += n_descriptors_to_tx; + + /* Process from tail to end of descriptor ring. */ + if (n_descriptors_to_tx > 0 && dq->tail_index < dq->n_descriptors) + { + u32 n = + clib_min (dq->n_descriptors - dq->tail_index, n_descriptors_to_tx); + n = ixge_tx_no_wrap (xm, xd, dq, from, dq->tail_index, n, &tx_state); + from += n; + n_descriptors_to_tx -= n; + dq->tail_index += n; + ASSERT (dq->tail_index <= dq->n_descriptors); + if (dq->tail_index == dq->n_descriptors) + dq->tail_index = 0; + } + + if (n_descriptors_to_tx > 0) + { + u32 n = + ixge_tx_no_wrap (xm, xd, dq, from, 0, n_descriptors_to_tx, &tx_state); + from += n; + ASSERT (n == n_descriptors_to_tx); + dq->tail_index += n; + ASSERT (dq->tail_index <= dq->n_descriptors); + if (dq->tail_index == dq->n_descriptors) + dq->tail_index = 0; + } + + /* We should only get full packets. */ + ASSERT (tx_state.is_start_of_packet); + + /* Report status when last descriptor is done. */ + { + u32 i = dq->tail_index == 0 ? dq->n_descriptors - 1 : dq->tail_index - 1; + ixge_tx_descriptor_t *d = &dq->descriptors[i].tx; + d->status0 |= IXGE_TX_DESCRIPTOR_STATUS0_REPORT_STATUS; + } + + /* Give new descriptors to hardware. */ + { + ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_TX, queue_index); + + CLIB_MEMORY_BARRIER (); + + dr->tail_index = dq->tail_index; + } + + /* Free any buffers that are done. 
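+ The pending-free vector also holds any tail-dropped buffers collected
+ above, so n_tail_drop is backed out below to keep n_buffers_on_ring
+ counting only buffers actually handed to hardware.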
*/ + { + u32 n = _vec_len (xm->tx_buffers_pending_free); + if (n > 0) + { + vlib_buffer_free_no_next (vm, xm->tx_buffers_pending_free, n); + _vec_len (xm->tx_buffers_pending_free) = 0; + ASSERT (dq->tx.n_buffers_on_ring >= n); + dq->tx.n_buffers_on_ring -= (n - n_tail_drop); + } + } + + return f->n_vectors; +} + +static uword +ixge_rx_queue_no_wrap (ixge_main_t * xm, + ixge_device_t * xd, + ixge_dma_queue_t * dq, + u32 start_descriptor_index, u32 n_descriptors) +{ + vlib_main_t *vm = xm->vlib_main; + vlib_node_runtime_t *node = dq->rx.node; + ixge_descriptor_t *d; + static ixge_descriptor_t *d_trace_save; + static u32 *d_trace_buffers; + u32 n_descriptors_left = n_descriptors; + u32 *to_rx = + vec_elt_at_index (dq->descriptor_buffer_indices, start_descriptor_index); + u32 *to_add; + u32 bi_sop = dq->rx.saved_start_of_packet_buffer_index; + u32 bi_last = dq->rx.saved_last_buffer_index; + u32 next_index_sop = dq->rx.saved_start_of_packet_next_index; + u32 is_sop = dq->rx.is_start_of_packet; + u32 next_index, n_left_to_next, *to_next; + u32 n_packets = 0; + u32 n_bytes = 0; + u32 n_trace = vlib_get_trace_count (vm, node); + vlib_buffer_t *b_last, b_dummy; + + ASSERT (start_descriptor_index + n_descriptors <= dq->n_descriptors); + d = &dq->descriptors[start_descriptor_index]; + + b_last = bi_last != ~0 ? vlib_get_buffer (vm, bi_last) : &b_dummy; + next_index = dq->rx.next_index; + + if (n_trace > 0) + { + u32 n = clib_min (n_trace, n_descriptors); + if (d_trace_save) + { + _vec_len (d_trace_save) = 0; + _vec_len (d_trace_buffers) = 0; + } + vec_add (d_trace_save, (ixge_descriptor_t *) d, n); + vec_add (d_trace_buffers, to_rx, n); + } + + { + uword l = vec_len (xm->rx_buffers_to_add); + + if (l < n_descriptors_left) + { + u32 n_to_alloc = 2 * dq->n_descriptors - l; + u32 n_allocated; + + vec_resize (xm->rx_buffers_to_add, n_to_alloc); + + _vec_len (xm->rx_buffers_to_add) = l; + n_allocated = vlib_buffer_alloc_from_free_list + (vm, xm->rx_buffers_to_add + l, n_to_alloc, + xm->vlib_buffer_free_list_index); + _vec_len (xm->rx_buffers_to_add) += n_allocated; + + /* Handle transient allocation failure */ + if (PREDICT_FALSE (l + n_allocated <= n_descriptors_left)) + { + if (n_allocated == 0) + vlib_error_count (vm, ixge_input_node.index, + IXGE_ERROR_rx_alloc_no_physmem, 1); + else + vlib_error_count (vm, ixge_input_node.index, + IXGE_ERROR_rx_alloc_fail, 1); + + n_descriptors_left = l + n_allocated; + } + n_descriptors = n_descriptors_left; + } + + /* Add buffers from end of vector going backwards. */ + to_add = vec_end (xm->rx_buffers_to_add) - 1; + } + + while (n_descriptors_left > 0) + { + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_descriptors_left >= 4 && n_left_to_next >= 2) + { + vlib_buffer_t *b0, *b1; + u32 bi0, fi0, len0, l3_offset0, s20, s00, flags0; + u32 bi1, fi1, len1, l3_offset1, s21, s01, flags1; + u8 is_eop0, error0, next0; + u8 is_eop1, error1, next1; + ixge_descriptor_t d0, d1; + + vlib_prefetch_buffer_with_index (vm, to_rx[2], STORE); + vlib_prefetch_buffer_with_index (vm, to_rx[3], STORE); + + CLIB_PREFETCH (d + 2, 32, STORE); + + d0.as_u32x4 = d[0].as_u32x4; + d1.as_u32x4 = d[1].as_u32x4; + + s20 = d0.rx_from_hw.status[2]; + s21 = d1.rx_from_hw.status[2]; + + s00 = d0.rx_from_hw.status[0]; + s01 = d1.rx_from_hw.status[0]; + + if (! 
+ ((s20 & s21) & IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE)) + goto found_hw_owned_descriptor_x2; + + bi0 = to_rx[0]; + bi1 = to_rx[1]; + + ASSERT (to_add - 1 >= xm->rx_buffers_to_add); + fi0 = to_add[0]; + fi1 = to_add[-1]; + + to_rx[0] = fi0; + to_rx[1] = fi1; + to_rx += 2; + to_add -= 2; + + ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == + vlib_buffer_is_known (vm, bi0)); + ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == + vlib_buffer_is_known (vm, bi1)); + ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == + vlib_buffer_is_known (vm, fi0)); + ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == + vlib_buffer_is_known (vm, fi1)); + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + /* + * Turn this on if you run into + * "bad monkey" contexts, and you want to know exactly + * which nodes they've visited... See main.c... + */ + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b1); + + CLIB_PREFETCH (b0->data, CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (b1->data, CLIB_CACHE_LINE_BYTES, LOAD); + + is_eop0 = (s20 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET) != 0; + is_eop1 = (s21 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET) != 0; + + ixge_rx_next_and_error_from_status_x2 (xd, s00, s20, s01, s21, + &next0, &error0, &flags0, + &next1, &error1, &flags1); + + next0 = is_sop ? next0 : next_index_sop; + next1 = is_eop0 ? next1 : next0; + next_index_sop = next1; + + b0->flags |= flags0 | (!is_eop0 << VLIB_BUFFER_LOG2_NEXT_PRESENT); + b1->flags |= flags1 | (!is_eop1 << VLIB_BUFFER_LOG2_NEXT_PRESENT); + + vnet_buffer (b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; + vnet_buffer (b1)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + vnet_buffer (b1)->sw_if_index[VLIB_TX] = (u32) ~ 0; + + b0->error = node->errors[error0]; + b1->error = node->errors[error1]; + + len0 = d0.rx_from_hw.n_packet_bytes_this_descriptor; + len1 = d1.rx_from_hw.n_packet_bytes_this_descriptor; + n_bytes += len0 + len1; + n_packets += is_eop0 + is_eop1; + + /* Give new buffers to hardware. */ + d0.rx_to_hw.tail_address = + vlib_get_buffer_data_physical_address (vm, fi0); + d1.rx_to_hw.tail_address = + vlib_get_buffer_data_physical_address (vm, fi1); + d0.rx_to_hw.head_address = d[0].rx_to_hw.tail_address; + d1.rx_to_hw.head_address = d[1].rx_to_hw.tail_address; + d[0].as_u32x4 = d0.as_u32x4; + d[1].as_u32x4 = d1.as_u32x4; + + d += 2; + n_descriptors_left -= 2; + + /* Point to either l2 or l3 header depending on next. */ + l3_offset0 = (is_sop && (next0 != IXGE_RX_NEXT_ETHERNET_INPUT)) + ? IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s00) : 0; + l3_offset1 = (is_eop0 && (next1 != IXGE_RX_NEXT_ETHERNET_INPUT)) + ? IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s01) : 0; + + b0->current_length = len0 - l3_offset0; + b1->current_length = len1 - l3_offset1; + b0->current_data = l3_offset0; + b1->current_data = l3_offset1; + + b_last->next_buffer = is_sop ? ~0 : bi0; + b0->next_buffer = is_eop0 ? ~0 : bi1; + bi_last = bi1; + b_last = b1; + + if (CLIB_DEBUG > 0) + { + u32 bi_sop0 = is_sop ? bi0 : bi_sop; + u32 bi_sop1 = is_eop0 ? bi1 : bi_sop0; + + if (is_eop0) + { + u8 *msg = vlib_validate_buffer (vm, bi_sop0, + /* follow_buffer_next */ 1); + ASSERT (!msg); + } + if (is_eop1) + { + u8 *msg = vlib_validate_buffer (vm, bi_sop1, + /* follow_buffer_next */ 1); + ASSERT (!msg); + } + } + if (0) /* "Dave" version */ + { + u32 bi_sop0 = is_sop ? bi0 : bi_sop; + u32 bi_sop1 = is_eop0 ? 
bi1 : bi_sop0; + + if (is_eop0) + { + to_next[0] = bi_sop0; + to_next++; + n_left_to_next--; + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi_sop0, next0); + } + if (is_eop1) + { + to_next[0] = bi_sop1; + to_next++; + n_left_to_next--; + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi_sop1, next1); + } + is_sop = is_eop1; + bi_sop = bi_sop1; + } + if (1) /* "Eliot" version */ + { + /* Speculatively enqueue to cached next. */ + u8 saved_is_sop = is_sop; + u32 bi_sop_save = bi_sop; + + bi_sop = saved_is_sop ? bi0 : bi_sop; + to_next[0] = bi_sop; + to_next += is_eop0; + n_left_to_next -= is_eop0; + + bi_sop = is_eop0 ? bi1 : bi_sop; + to_next[0] = bi_sop; + to_next += is_eop1; + n_left_to_next -= is_eop1; + + is_sop = is_eop1; + + if (PREDICT_FALSE + (!(next0 == next_index && next1 == next_index))) + { + /* Undo speculation. */ + to_next -= is_eop0 + is_eop1; + n_left_to_next += is_eop0 + is_eop1; + + /* Re-do both descriptors being careful about where we enqueue. */ + bi_sop = saved_is_sop ? bi0 : bi_sop_save; + if (is_eop0) + { + if (next0 != next_index) + vlib_set_next_frame_buffer (vm, node, next0, bi_sop); + else + { + to_next[0] = bi_sop; + to_next += 1; + n_left_to_next -= 1; + } + } + + bi_sop = is_eop0 ? bi1 : bi_sop; + if (is_eop1) + { + if (next1 != next_index) + vlib_set_next_frame_buffer (vm, node, next1, bi_sop); + else + { + to_next[0] = bi_sop; + to_next += 1; + n_left_to_next -= 1; + } + } + + /* Switch cached next index when next for both packets is the same. */ + if (is_eop0 && is_eop1 && next0 == next1) + { + vlib_put_next_frame (vm, node, next_index, + n_left_to_next); + next_index = next0; + vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + } + } + } + } + + /* Bail out of dual loop and proceed with single loop. */ + found_hw_owned_descriptor_x2: + + while (n_descriptors_left > 0 && n_left_to_next > 0) + { + vlib_buffer_t *b0; + u32 bi0, fi0, len0, l3_offset0, s20, s00, flags0; + u8 is_eop0, error0, next0; + ixge_descriptor_t d0; + + d0.as_u32x4 = d[0].as_u32x4; + + s20 = d0.rx_from_hw.status[2]; + s00 = d0.rx_from_hw.status[0]; + + if (!(s20 & IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE)) + goto found_hw_owned_descriptor_x1; + + bi0 = to_rx[0]; + ASSERT (to_add >= xm->rx_buffers_to_add); + fi0 = to_add[0]; + + to_rx[0] = fi0; + to_rx += 1; + to_add -= 1; + + ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == + vlib_buffer_is_known (vm, bi0)); + ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == + vlib_buffer_is_known (vm, fi0)); + + b0 = vlib_get_buffer (vm, bi0); + + /* + * Turn this on if you run into + * "bad monkey" contexts, and you want to know exactly + * which nodes they've visited... + */ + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + + is_eop0 = (s20 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET) != 0; + ixge_rx_next_and_error_from_status_x1 + (xd, s00, s20, &next0, &error0, &flags0); + + next0 = is_sop ? next0 : next_index_sop; + next_index_sop = next0; + + b0->flags |= flags0 | (!is_eop0 << VLIB_BUFFER_LOG2_NEXT_PRESENT); + + vnet_buffer (b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + + b0->error = node->errors[error0]; + + len0 = d0.rx_from_hw.n_packet_bytes_this_descriptor; + n_bytes += len0; + n_packets += is_eop0; + + /* Give new buffer to hardware. 
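+ The slot is recycled in place: it is rewritten in rx_to_hw layout
+ with the replacement buffer's physical address, and hardware
+ overwrites it in rx_from_hw layout when the next packet arrives.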
*/ + d0.rx_to_hw.tail_address = + vlib_get_buffer_data_physical_address (vm, fi0); + d0.rx_to_hw.head_address = d0.rx_to_hw.tail_address; + d[0].as_u32x4 = d0.as_u32x4; + + d += 1; + n_descriptors_left -= 1; + + /* Point to either l2 or l3 header depending on next. */ + l3_offset0 = (is_sop && (next0 != IXGE_RX_NEXT_ETHERNET_INPUT)) + ? IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s00) : 0; + b0->current_length = len0 - l3_offset0; + b0->current_data = l3_offset0; + + b_last->next_buffer = is_sop ? ~0 : bi0; + bi_last = bi0; + b_last = b0; + + bi_sop = is_sop ? bi0 : bi_sop; + + if (CLIB_DEBUG > 0 && is_eop0) + { + u8 *msg = + vlib_validate_buffer (vm, bi_sop, /* follow_buffer_next */ 1); + ASSERT (!msg); + } + + if (0) /* "Dave" version */ + { + if (is_eop0) + { + to_next[0] = bi_sop; + to_next++; + n_left_to_next--; + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi_sop, next0); + } + } + if (1) /* "Eliot" version */ + { + if (PREDICT_TRUE (next0 == next_index)) + { + to_next[0] = bi_sop; + to_next += is_eop0; + n_left_to_next -= is_eop0; + } + else + { + if (next0 != next_index && is_eop0) + vlib_set_next_frame_buffer (vm, node, next0, bi_sop); + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + next_index = next0; + vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + } + } + is_sop = is_eop0; + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + +found_hw_owned_descriptor_x1: + if (n_descriptors_left > 0) + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + + _vec_len (xm->rx_buffers_to_add) = (to_add + 1) - xm->rx_buffers_to_add; + + { + u32 n_done = n_descriptors - n_descriptors_left; + + if (n_trace > 0 && n_done > 0) + { + u32 n = clib_min (n_trace, n_done); + ixge_rx_trace (xm, xd, dq, + d_trace_save, + d_trace_buffers, + &dq->descriptors[start_descriptor_index], n); + vlib_set_trace_count (vm, node, n_trace - n); + } + if (d_trace_save) + { + _vec_len (d_trace_save) = 0; + _vec_len (d_trace_buffers) = 0; + } + + /* Don't keep a reference to b_last if we don't have to. + Otherwise we can over-write a next_buffer pointer after having already + enqueued a packet. */ + if (is_sop) + { + b_last->next_buffer = ~0; + bi_last = ~0; + } + + dq->rx.n_descriptors_done_this_call = n_done; + dq->rx.n_descriptors_done_total += n_done; + dq->rx.is_start_of_packet = is_sop; + dq->rx.saved_start_of_packet_buffer_index = bi_sop; + dq->rx.saved_last_buffer_index = bi_last; + dq->rx.saved_start_of_packet_next_index = next_index_sop; + dq->rx.next_index = next_index; + dq->rx.n_bytes += n_bytes; + + return n_packets; + } +} + +static uword +ixge_rx_queue (ixge_main_t * xm, + ixge_device_t * xd, + vlib_node_runtime_t * node, u32 queue_index) +{ + ixge_dma_queue_t *dq = + vec_elt_at_index (xd->dma_queues[VLIB_RX], queue_index); + ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_RX, dq->queue_index); + uword n_packets = 0; + u32 hw_head_index, sw_head_index; + + /* One-time initialization. */ + if (!dq->rx.node) + { + dq->rx.node = node; + dq->rx.is_start_of_packet = 1; + dq->rx.saved_start_of_packet_buffer_index = ~0; + dq->rx.saved_last_buffer_index = ~0; + } + + dq->rx.next_index = node->cached_next_index; + + dq->rx.n_descriptors_done_total = 0; + dq->rx.n_descriptors_done_this_call = 0; + dq->rx.n_bytes = 0; + + /* Fetch head from hardware and compare to where we think we are.
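+ Descriptors in [sw_head, hw_head) are complete. If hw_head is behind
+ sw_head, the valid range wraps past the end of the ring and is
+ consumed in two ixge_rx_queue_no_wrap calls: sw_head to ring end,
+ then ring start to hw_head.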
*/ + hw_head_index = dr->head_index; + sw_head_index = dq->head_index; + + if (hw_head_index == sw_head_index) + goto done; + + if (hw_head_index < sw_head_index) + { + u32 n_tried = dq->n_descriptors - sw_head_index; + n_packets += ixge_rx_queue_no_wrap (xm, xd, dq, sw_head_index, n_tried); + sw_head_index = + ixge_ring_add (dq, sw_head_index, + dq->rx.n_descriptors_done_this_call); + + if (dq->rx.n_descriptors_done_this_call != n_tried) + goto done; + } + if (hw_head_index >= sw_head_index) + { + u32 n_tried = hw_head_index - sw_head_index; + n_packets += ixge_rx_queue_no_wrap (xm, xd, dq, sw_head_index, n_tried); + sw_head_index = + ixge_ring_add (dq, sw_head_index, + dq->rx.n_descriptors_done_this_call); + } + +done: + dq->head_index = sw_head_index; + dq->tail_index = + ixge_ring_add (dq, dq->tail_index, dq->rx.n_descriptors_done_total); + + /* Give tail back to hardware. */ + CLIB_MEMORY_BARRIER (); + + dr->tail_index = dq->tail_index; + + vlib_increment_combined_counter (vnet_main. + interface_main.combined_sw_if_counters + + VNET_INTERFACE_COUNTER_RX, + 0 /* thread_index */ , + xd->vlib_sw_if_index, n_packets, + dq->rx.n_bytes); + + return n_packets; +} + +static void +ixge_interrupt (ixge_main_t * xm, ixge_device_t * xd, u32 i) +{ + vlib_main_t *vm = xm->vlib_main; + ixge_regs_t *r = xd->regs; + + if (i != 20) + { + ELOG_TYPE_DECLARE (e) = + { + .function = (char *) __FUNCTION__,.format = + "ixge %d, %s",.format_args = "i1t1",.n_enum_strings = + 16,.enum_strings = + { + "flow director", + "rx miss", + "pci exception", + "mailbox", + "link status change", + "linksec key exchange", + "manageability event", + "reserved23", + "sdp0", + "sdp1", + "sdp2", + "sdp3", + "ecc", "descriptor handler error", "tcp timer", "other",},}; + struct + { + u8 instance; + u8 index; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->instance = xd->device_index; + ed->index = i - 16; + } + else + { + u32 v = r->xge_mac.link_status; + uword is_up = (v & (1 << 30)) != 0; + + ELOG_TYPE_DECLARE (e) = + { + .function = (char *) __FUNCTION__,.format = + "ixge %d, link status change 0x%x",.format_args = "i4i4",}; + struct + { + u32 instance, link_status; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->instance = xd->device_index; + ed->link_status = v; + xd->link_status_at_last_link_change = v; + + vlib_process_signal_event (vm, ixge_process_node.index, + EVENT_SET_FLAGS, + ((is_up << 31) | xd->vlib_hw_if_index)); + } +} + +always_inline u32 +clean_block (u32 * b, u32 * t, u32 n_left) +{ + u32 *t0 = t; + + while (n_left >= 4) + { + u32 bi0, bi1, bi2, bi3; + + t[0] = bi0 = b[0]; + b[0] = 0; + t += bi0 != 0; + + t[0] = bi1 = b[1]; + b[1] = 0; + t += bi1 != 0; + + t[0] = bi2 = b[2]; + b[2] = 0; + t += bi2 != 0; + + t[0] = bi3 = b[3]; + b[3] = 0; + t += bi3 != 0; + + b += 4; + n_left -= 4; + } + + while (n_left > 0) + { + u32 bi0; + + t[0] = bi0 = b[0]; + b[0] = 0; + t += bi0 != 0; + b += 1; + n_left -= 1; + } + + return t - t0; +} + +static void +ixge_tx_queue (ixge_main_t * xm, ixge_device_t * xd, u32 queue_index) +{ + vlib_main_t *vm = xm->vlib_main; + ixge_dma_queue_t *dq = + vec_elt_at_index (xd->dma_queues[VLIB_TX], queue_index); + u32 n_clean, *b, *t, *t0; + i32 n_hw_owned_descriptors; + i32 first_to_clean, last_to_clean; + u64 hwbp_race = 0; + + /* Handle case where head write back pointer update + * arrives after the interrupt during high PCI bus loads. 
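+ * The loop below spins until the write-back location catches up,
+ * counting iterations in hwbp_race so the race can be event-logged.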
+ */ + while ((dq->head_index == dq->tx.head_index_write_back[0]) && + dq->tx.n_buffers_on_ring && (dq->head_index != dq->tail_index)) + { + hwbp_race++; + if (IXGE_HWBP_RACE_ELOG && (hwbp_race == 1)) + { + ELOG_TYPE_DECLARE (e) = + { + .function = (char *) __FUNCTION__,.format = + "ixge %d tx head index race: head %4d, tail %4d, buffs %4d",.format_args + = "i4i4i4i4",}; + struct + { + u32 instance, head_index, tail_index, n_buffers_on_ring; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->instance = xd->device_index; + ed->head_index = dq->head_index; + ed->tail_index = dq->tail_index; + ed->n_buffers_on_ring = dq->tx.n_buffers_on_ring; + } + } + + dq->head_index = dq->tx.head_index_write_back[0]; + n_hw_owned_descriptors = ixge_ring_sub (dq, dq->head_index, dq->tail_index); + ASSERT (dq->tx.n_buffers_on_ring >= n_hw_owned_descriptors); + n_clean = dq->tx.n_buffers_on_ring - n_hw_owned_descriptors; + + if (IXGE_HWBP_RACE_ELOG && hwbp_race) + { + ELOG_TYPE_DECLARE (e) = + { + .function = (char *) __FUNCTION__,.format = + "ixge %d tx head index race: head %4d, hw_owned %4d, n_clean %4d, retries %d",.format_args + = "i4i4i4i4i4",}; + struct + { + u32 instance, head_index, n_hw_owned_descriptors, n_clean, retries; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->instance = xd->device_index; + ed->head_index = dq->head_index; + ed->n_hw_owned_descriptors = n_hw_owned_descriptors; + ed->n_clean = n_clean; + ed->retries = hwbp_race; + } + + /* + * This function used to wait until hardware owned zero descriptors. + * At high PPS rates, that doesn't happen until the TX ring is + * completely full of descriptors which need to be cleaned up. + * That, in turn, causes TX ring-full drops and/or long RX service + * interruptions. + */ + if (n_clean == 0) + return; + + /* Clean the n_clean descriptors prior to the reported hardware head */ + last_to_clean = dq->head_index - 1; + last_to_clean = (last_to_clean < 0) ? last_to_clean + dq->n_descriptors : + last_to_clean; + + first_to_clean = (last_to_clean) - (n_clean - 1); + first_to_clean = (first_to_clean < 0) ? first_to_clean + dq->n_descriptors : + first_to_clean; + + vec_resize (xm->tx_buffers_pending_free, dq->n_descriptors - 1); + t0 = t = xm->tx_buffers_pending_free; + b = dq->descriptor_buffer_indices + first_to_clean; + + /* Wrap case: clean from first to end, then start to last */ + if (first_to_clean > last_to_clean) + { + t += clean_block (b, t, (dq->n_descriptors - 1) - first_to_clean); + first_to_clean = 0; + b = dq->descriptor_buffer_indices; + } + + /* Typical case: clean from first to last */ + if (first_to_clean <= last_to_clean) + t += clean_block (b, t, (last_to_clean - first_to_clean) + 1); + + if (t > t0) + { + u32 n = t - t0; + vlib_buffer_free_no_next (vm, t0, n); + ASSERT (dq->tx.n_buffers_on_ring >= n); + dq->tx.n_buffers_on_ring -= n; + _vec_len (xm->tx_buffers_pending_free) = 0; + } +} + +/* RX queue interrupts 0 thru 7; TX 8 thru 15. 
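+ So rx queue 0 raises status bit 0 and tx queue 0 raises status bit 8,
+ matching the queue_mapping programming in ixge_device_init; bits 16
+ and up are the miscellaneous causes decoded in ixge_interrupt.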
*/ +always_inline uword +ixge_interrupt_is_rx_queue (uword i) +{ + return i < 8; +} + +always_inline uword +ixge_interrupt_is_tx_queue (uword i) +{ + return i >= 8 && i < 16; +} + +always_inline uword +ixge_tx_queue_to_interrupt (uword i) +{ + return 8 + i; +} + +always_inline uword +ixge_rx_queue_to_interrupt (uword i) +{ + return 0 + i; +} + +always_inline uword +ixge_interrupt_rx_queue (uword i) +{ + ASSERT (ixge_interrupt_is_rx_queue (i)); + return i - 0; +} + +always_inline uword +ixge_interrupt_tx_queue (uword i) +{ + ASSERT (ixge_interrupt_is_tx_queue (i)); + return i - 8; +} + +static uword +ixge_device_input (ixge_main_t * xm, + ixge_device_t * xd, vlib_node_runtime_t * node) +{ + ixge_regs_t *r = xd->regs; + u32 i, s; + uword n_rx_packets = 0; + + s = r->interrupt.status_write_1_to_set; + if (s) + r->interrupt.status_write_1_to_clear = s; + + /* *INDENT-OFF* */ + foreach_set_bit (i, s, ({ + if (ixge_interrupt_is_rx_queue (i)) + n_rx_packets += ixge_rx_queue (xm, xd, node, ixge_interrupt_rx_queue (i)); + + else if (ixge_interrupt_is_tx_queue (i)) + ixge_tx_queue (xm, xd, ixge_interrupt_tx_queue (i)); + + else + ixge_interrupt (xm, xd, i); + })); + /* *INDENT-ON* */ + + return n_rx_packets; +} + +static uword +ixge_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * f) +{ + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd; + uword n_rx_packets = 0; + + if (node->state == VLIB_NODE_STATE_INTERRUPT) + { + uword i; + + /* Loop over devices with interrupts. */ + /* *INDENT-OFF* */ + foreach_set_bit (i, node->runtime_data[0], ({ + xd = vec_elt_at_index (xm->devices, i); + n_rx_packets += ixge_device_input (xm, xd, node); + + /* Re-enable interrupts since we're going to stay in interrupt mode. */ + if (! (node->flags & VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE)) + xd->regs->interrupt.enable_write_1_to_set = ~0; + })); + /* *INDENT-ON* */ + + /* Clear mask of devices with pending interrupts. */ + node->runtime_data[0] = 0; + } + else + { + /* Poll all devices for input/interrupts. */ + vec_foreach (xd, xm->devices) + { + n_rx_packets += ixge_device_input (xm, xd, node); + + /* Re-enable interrupts when switching out of polling mode. */ + if (node->flags & + VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE) + xd->regs->interrupt.enable_write_1_to_set = ~0; + } + } + + return n_rx_packets; +} + +static char *ixge_error_strings[] = { +#define _(n,s) s, + foreach_ixge_error +#undef _ +}; + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (ixge_input_node, static) = { + .function = ixge_input, + .type = VLIB_NODE_TYPE_INPUT, + .name = "ixge-input", + + /* Will be enabled if/when hardware is detected. 
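+ ixge_pci_init switches the node to VLIB_NODE_STATE_POLLING or
+ VLIB_NODE_STATE_INTERRUPT once a supported device is found.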
*/ + .state = VLIB_NODE_STATE_DISABLED, + + .format_buffer = format_ethernet_header_with_length, + .format_trace = format_ixge_rx_dma_trace, + + .n_errors = IXGE_N_ERROR, + .error_strings = ixge_error_strings, + + .n_next_nodes = IXGE_RX_N_NEXT, + .next_nodes = { + [IXGE_RX_NEXT_DROP] = "error-drop", + [IXGE_RX_NEXT_ETHERNET_INPUT] = "ethernet-input", + [IXGE_RX_NEXT_IP4_INPUT] = "ip4-input", + [IXGE_RX_NEXT_IP6_INPUT] = "ip6-input", + }, +}; + +VLIB_NODE_FUNCTION_MULTIARCH_CLONE (ixge_input) +CLIB_MULTIARCH_SELECT_FN (ixge_input) +/* *INDENT-ON* */ + +static u8 * +format_ixge_device_name (u8 * s, va_list * args) +{ + u32 i = va_arg (*args, u32); + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, i); + return format (s, "TenGigabitEthernet%U", + format_vlib_pci_handle, &xd->pci_device.bus_address); +} + +#define IXGE_COUNTER_IS_64_BIT (1 << 0) +#define IXGE_COUNTER_NOT_CLEAR_ON_READ (1 << 1) + +static u8 ixge_counter_flags[] = { +#define _(a,f) 0, +#define _64(a,f) IXGE_COUNTER_IS_64_BIT, + foreach_ixge_counter +#undef _ +#undef _64 +}; + +static void +ixge_update_counters (ixge_device_t * xd) +{ + /* Byte offset for counter registers. */ + static u32 reg_offsets[] = { +#define _(a,f) (a) / sizeof (u32), +#define _64(a,f) _(a,f) + foreach_ixge_counter +#undef _ +#undef _64 + }; + volatile u32 *r = (volatile u32 *) xd->regs; + int i; + + for (i = 0; i < ARRAY_LEN (xd->counters); i++) + { + u32 o = reg_offsets[i]; + xd->counters[i] += r[o]; + if (ixge_counter_flags[i] & IXGE_COUNTER_NOT_CLEAR_ON_READ) + r[o] = 0; + if (ixge_counter_flags[i] & IXGE_COUNTER_IS_64_BIT) + xd->counters[i] += (u64) r[o + 1] << (u64) 32; + } +} + +static u8 * +format_ixge_device_id (u8 * s, va_list * args) +{ + u32 device_id = va_arg (*args, u32); + char *t = 0; + switch (device_id) + { +#define _(f,n) case n: t = #f; break; + foreach_ixge_pci_device_id; +#undef _ + default: + t = 0; + break; + } + if (t == 0) + s = format (s, "unknown 0x%x", device_id); + else + s = format (s, "%s", t); + return s; +} + +static u8 * +format_ixge_link_status (u8 * s, va_list * args) +{ + ixge_device_t *xd = va_arg (*args, ixge_device_t *); + u32 v = xd->link_status_at_last_link_change; + + s = format (s, "%s", (v & (1 << 30)) ? 
"up" : "down"); + + { + char *modes[] = { + "1g", "10g parallel", "10g serial", "autoneg", + }; + char *speeds[] = { + "unknown", "100m", "1g", "10g", + }; + s = format (s, ", mode %s, speed %s", + modes[(v >> 26) & 3], speeds[(v >> 28) & 3]); + } + + return s; +} + +static u8 * +format_ixge_device (u8 * s, va_list * args) +{ + u32 dev_instance = va_arg (*args, u32); + CLIB_UNUSED (int verbose) = va_arg (*args, int); + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, dev_instance); + ixge_phy_t *phy = xd->phys + xd->phy_index; + uword indent = format_get_indent (s); + + ixge_update_counters (xd); + xd->link_status_at_last_link_change = xd->regs->xge_mac.link_status; + + s = format (s, "Intel 8259X: id %U\n%Ulink %U", + format_ixge_device_id, xd->device_id, + format_white_space, indent + 2, format_ixge_link_status, xd); + + { + + s = format (s, "\n%UPCIe %U", format_white_space, indent + 2, + format_vlib_pci_link_speed, &xd->pci_device); + } + + s = format (s, "\n%U", format_white_space, indent + 2); + if (phy->mdio_address != ~0) + s = format (s, "PHY address %d, id 0x%x", phy->mdio_address, phy->id); + else if (xd->sfp_eeprom.id == SFP_ID_sfp) + s = format (s, "SFP %U", format_sfp_eeprom, &xd->sfp_eeprom); + else + s = format (s, "PHY not found"); + + /* FIXME */ + { + ixge_dma_queue_t *dq = vec_elt_at_index (xd->dma_queues[VLIB_RX], 0); + ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_RX, 0); + u32 hw_head_index = dr->head_index; + u32 sw_head_index = dq->head_index; + u32 nitems; + + nitems = ixge_ring_sub (dq, hw_head_index, sw_head_index); + s = format (s, "\n%U%d unprocessed, %d total buffers on rx queue 0 ring", + format_white_space, indent + 2, nitems, dq->n_descriptors); + + s = format (s, "\n%U%d buffers in driver rx cache", + format_white_space, indent + 2, + vec_len (xm->rx_buffers_to_add)); + + s = format (s, "\n%U%d buffers on tx queue 0 ring", + format_white_space, indent + 2, + xd->dma_queues[VLIB_TX][0].tx.n_buffers_on_ring); + } + { + u32 i; + u64 v; + static char *names[] = { +#define _(a,f) #f, +#define _64(a,f) _(a,f) + foreach_ixge_counter +#undef _ +#undef _64 + }; + + for (i = 0; i < ARRAY_LEN (names); i++) + { + v = xd->counters[i] - xd->counters_last_clear[i]; + if (v != 0) + s = format (s, "\n%U%-40U%16Ld", + format_white_space, indent + 2, + format_c_identifier, names[i], v); + } + } + + return s; +} + +static void +ixge_clear_hw_interface_counters (u32 instance) +{ + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, instance); + ixge_update_counters (xd); + memcpy (xd->counters_last_clear, xd->counters, sizeof (xd->counters)); +} + +/* + * Dynamically redirect all pkts from a specific interface + * to the specified node + */ +static void +ixge_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index, + u32 node_index) +{ + ixge_main_t *xm = &ixge_main; + vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); + ixge_device_t *xd = vec_elt_at_index (xm->devices, hw->dev_instance); + + /* Shut off redirection */ + if (node_index == ~0) + { + xd->per_interface_next_index = node_index; + return; + } + + xd->per_interface_next_index = + vlib_node_add_next (xm->vlib_main, ixge_input_node.index, node_index); +} + + +/* *INDENT-OFF* */ +VNET_DEVICE_CLASS (ixge_device_class) = { + .name = "ixge", + .tx_function = ixge_interface_tx, + .format_device_name = format_ixge_device_name, + .format_device = format_ixge_device, + .format_tx_trace = format_ixge_tx_dma_trace, + .clear_counters = 
ixge_clear_hw_interface_counters, + .admin_up_down_function = ixge_interface_admin_up_down, + .rx_redirect_to_node = ixge_set_interface_next_node, +}; +/* *INDENT-ON* */ + +#define IXGE_N_BYTES_IN_RX_BUFFER (2048) // DAW-HACK: Set Rx buffer size so all packets < ETH_MTU_SIZE fit in the buffer (i.e. sop & eop for all descriptors). + +static clib_error_t * +ixge_dma_init (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 queue_index) +{ + ixge_main_t *xm = &ixge_main; + vlib_main_t *vm = xm->vlib_main; + ixge_dma_queue_t *dq; + clib_error_t *error = 0; + + vec_validate (xd->dma_queues[rt], queue_index); + dq = vec_elt_at_index (xd->dma_queues[rt], queue_index); + + if (!xm->n_descriptors_per_cache_line) + xm->n_descriptors_per_cache_line = + CLIB_CACHE_LINE_BYTES / sizeof (dq->descriptors[0]); + + if (!xm->n_bytes_in_rx_buffer) + xm->n_bytes_in_rx_buffer = IXGE_N_BYTES_IN_RX_BUFFER; + xm->n_bytes_in_rx_buffer = round_pow2 (xm->n_bytes_in_rx_buffer, 1024); + if (!xm->vlib_buffer_free_list_index) + { + xm->vlib_buffer_free_list_index = + vlib_buffer_get_or_create_free_list (vm, xm->n_bytes_in_rx_buffer, + "ixge rx"); + ASSERT (xm->vlib_buffer_free_list_index != 0); + } + + if (!xm->n_descriptors[rt]) + xm->n_descriptors[rt] = 4 * VLIB_FRAME_SIZE; + + dq->queue_index = queue_index; + dq->n_descriptors = + round_pow2 (xm->n_descriptors[rt], xm->n_descriptors_per_cache_line); + dq->head_index = dq->tail_index = 0; + + dq->descriptors = + vlib_physmem_alloc_aligned (vm, xm->physmem_region, &error, + dq->n_descriptors * + sizeof (dq->descriptors[0]), + 128 /* per chip spec */ ); + if (error) + return error; + + memset (dq->descriptors, 0, + dq->n_descriptors * sizeof (dq->descriptors[0])); + vec_resize (dq->descriptor_buffer_indices, dq->n_descriptors); + + if (rt == VLIB_RX) + { + u32 n_alloc, i; + + n_alloc = vlib_buffer_alloc_from_free_list + (vm, dq->descriptor_buffer_indices, + vec_len (dq->descriptor_buffer_indices), + xm->vlib_buffer_free_list_index); + ASSERT (n_alloc == vec_len (dq->descriptor_buffer_indices)); + for (i = 0; i < n_alloc; i++) + { + vlib_buffer_t *b = + vlib_get_buffer (vm, dq->descriptor_buffer_indices[i]); + dq->descriptors[i].rx_to_hw.tail_address = + vlib_physmem_virtual_to_physical (vm, xm->physmem_region, + b->data); + } + } + else + { + u32 i; + + dq->tx.head_index_write_back = + vlib_physmem_alloc (vm, vm->buffer_main->physmem_region, &error, + CLIB_CACHE_LINE_BYTES); + + for (i = 0; i < dq->n_descriptors; i++) + dq->descriptors[i].tx = xm->tx_descriptor_template; + + vec_validate (xm->tx_buffers_pending_free, dq->n_descriptors - 1); + } + + { + ixge_dma_regs_t *dr = get_dma_regs (xd, rt, queue_index); + u64 a; + + a = + vlib_physmem_virtual_to_physical (vm, vm->buffer_main->physmem_region, + dq->descriptors); + dr->descriptor_address[0] = a & 0xFFFFFFFF; + dr->descriptor_address[1] = a >> (u64) 32; + dr->n_descriptor_bytes = dq->n_descriptors * sizeof (dq->descriptors[0]); + dq->head_index = dq->tail_index = 0; + + if (rt == VLIB_RX) + { + ASSERT ((xm->n_bytes_in_rx_buffer / 1024) < 32); + dr->rx_split_control = + ( /* buffer size */ ((xm->n_bytes_in_rx_buffer / 1024) << 0) + | ( /* lo free descriptor threshold (units of 64 descriptors) */ + (1 << 22)) | ( /* descriptor type: advanced one buffer */ + (1 << 25)) | ( /* drop if no descriptors available */ + (1 << 28))); + + /* Give hardware all but last 16 cache lines' worth of descriptors. 
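+ A 16-byte descriptor packs 4 per 64-byte cache line, so 64 slots are
+ held back; e.g. with the default 4 * VLIB_FRAME_SIZE ring of 1024
+ descriptors the tail starts at index 960.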
*/ + dq->tail_index = dq->n_descriptors - + 16 * xm->n_descriptors_per_cache_line; + } + else + { + /* Make sure it's initialized before hardware can get to it. */ + dq->tx.head_index_write_back[0] = dq->head_index; + + a = + vlib_physmem_virtual_to_physical (vm, + vm->buffer_main->physmem_region, + dq->tx.head_index_write_back); + dr->tx.head_index_write_back_address[0] = /* enable bit */ 1 | a; + dr->tx.head_index_write_back_address[1] = (u64) a >> (u64) 32; + } + + /* DMA on 82599 does not work with [13] rx data write relaxed ordering + and [12] undocumented set. */ + if (rt == VLIB_RX) + dr->dca_control &= ~((1 << 13) | (1 << 12)); + + CLIB_MEMORY_BARRIER (); + + if (rt == VLIB_TX) + { + xd->regs->tx_dma_control |= (1 << 0); + dr->control |= ((32 << 0) /* prefetch threshold */ + | (64 << 8) /* host threshold */ + | (0 << 16) /* writeback threshold */ ); + } + + /* Enable this queue and wait for hardware to initialize + before adding to tail. */ + if (rt == VLIB_TX) + { + dr->control |= 1 << 25; + while (!(dr->control & (1 << 25))) + ; + } + + /* Set head/tail indices and enable DMA. */ + dr->head_index = dq->head_index; + dr->tail_index = dq->tail_index; + } + + return error; +} + +static u32 +ixge_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hw, u32 flags) +{ + ixge_device_t *xd; + ixge_regs_t *r; + u32 old; + ixge_main_t *xm = &ixge_main; + + xd = vec_elt_at_index (xm->devices, hw->dev_instance); + r = xd->regs; + + old = r->filter_control; + + if (flags & ETHERNET_INTERFACE_FLAG_ACCEPT_ALL) + r->filter_control = old | (1 << 9) /* unicast promiscuous */ ; + else + r->filter_control = old & ~(1 << 9); + + return old; +} + +static void +ixge_device_init (ixge_main_t * xm) +{ + vnet_main_t *vnm = vnet_get_main (); + ixge_device_t *xd; + + /* Reset chip(s). */ + vec_foreach (xd, xm->devices) + { + ixge_regs_t *r = xd->regs; + const u32 reset_bit = (1 << 26) | (1 << 3); + + r->control |= reset_bit; + + /* No need to suspend. Timed to take ~1e-6 secs */ + while (r->control & reset_bit) + ; + + /* Software loaded. */ + r->extended_control |= (1 << 28); + + ixge_phy_init (xd); + + /* Register ethernet interface. */ + { + u8 addr8[6]; + u32 i, addr32[2]; + clib_error_t *error; + + addr32[0] = r->rx_ethernet_address0[0][0]; + addr32[1] = r->rx_ethernet_address0[0][1]; + for (i = 0; i < 6; i++) + addr8[i] = addr32[i / 4] >> ((i % 4) * 8); + + error = ethernet_register_interface + (vnm, ixge_device_class.index, xd->device_index, + /* ethernet address */ addr8, + &xd->vlib_hw_if_index, ixge_flag_change); + if (error) + clib_error_report (error); + } + + { + vnet_sw_interface_t *sw = + vnet_get_hw_sw_interface (vnm, xd->vlib_hw_if_index); + xd->vlib_sw_if_index = sw->sw_if_index; + } + + ixge_dma_init (xd, VLIB_RX, /* queue_index */ 0); + + xm->n_descriptors[VLIB_TX] = 20 * VLIB_FRAME_SIZE; + + ixge_dma_init (xd, VLIB_TX, /* queue_index */ 0); + + /* RX/TX queue 0 gets mapped to interrupt bits 0 & 8. */ + r->interrupt.queue_mapping[0] = (( /* valid bit */ (1 << 7) | + ixge_rx_queue_to_interrupt (0)) << 0); + + r->interrupt.queue_mapping[0] |= (( /* valid bit */ (1 << 7) | + ixge_tx_queue_to_interrupt (0)) << 8); + + /* No use in getting too many interrupts. + Limit them to one every 3/4 ring size at line rate + min sized packets. + No need for this since kernel/vlib main loop provides adequate interrupt + limiting scheme.
*/ + if (0) + { + f64 line_rate_max_pps = + 10e9 / (8 * (64 + /* interframe padding */ 20)); + ixge_throttle_queue_interrupt (r, 0, + .75 * xm->n_descriptors[VLIB_RX] / + line_rate_max_pps); + } + + /* Accept all multicast and broadcast packets. Should really add them + to the dst_ethernet_address register array. */ + r->filter_control |= (1 << 10) | (1 << 8); + + /* Enable frames up to size in mac frame size register. */ + r->xge_mac.control |= 1 << 2; + r->xge_mac.rx_max_frame_size = (9216 + 14) << 16; + + /* Enable all interrupts. */ + if (!IXGE_ALWAYS_POLL) + r->interrupt.enable_write_1_to_set = ~0; + } +} + +static uword +ixge_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) +{ + vnet_main_t *vnm = vnet_get_main (); + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd; + uword event_type, *event_data = 0; + f64 timeout, link_debounce_deadline; + + ixge_device_init (xm); + + /* Clear all counters. */ + vec_foreach (xd, xm->devices) + { + ixge_update_counters (xd); + memset (xd->counters, 0, sizeof (xd->counters)); + } + + timeout = 30.0; + link_debounce_deadline = 1e70; + + while (1) + { + /* 36 bit stat counters could overflow in ~50 secs. + We poll every 30 secs to be conservative. */ + vlib_process_wait_for_event_or_clock (vm, timeout); + + event_type = vlib_process_get_events (vm, &event_data); + + switch (event_type) + { + case EVENT_SET_FLAGS: + /* 1 ms */ + link_debounce_deadline = vlib_time_now (vm) + 1e-3; + timeout = 1e-3; + break; + + case ~0: + /* No events found: timer expired. */ + if (vlib_time_now (vm) > link_debounce_deadline) + { + vec_foreach (xd, xm->devices) + { + ixge_regs_t *r = xd->regs; + u32 v = r->xge_mac.link_status; + uword is_up = (v & (1 << 30)) != 0; + + vnet_hw_interface_set_flags + (vnm, xd->vlib_hw_if_index, + is_up ? VNET_HW_INTERFACE_FLAG_LINK_UP : 0); + } + link_debounce_deadline = 1e70; + timeout = 30.0; + } + break; + + default: + ASSERT (0); + } + + if (event_data) + _vec_len (event_data) = 0; + + /* Query stats every 30 secs. */ + { + f64 now = vlib_time_now (vm); + if (now - xm->time_last_stats_update > 30) + { + xm->time_last_stats_update = now; + vec_foreach (xd, xm->devices) ixge_update_counters (xd); + } + } + } + + return 0; +} + +static vlib_node_registration_t ixge_process_node = { + .function = ixge_process, + .type = VLIB_NODE_TYPE_PROCESS, + .name = "ixge-process", +}; + +clib_error_t * +ixge_init (vlib_main_t * vm) +{ + ixge_main_t *xm = &ixge_main; + clib_error_t *error; + + xm->vlib_main = vm; + memset (&xm->tx_descriptor_template, 0, + sizeof (xm->tx_descriptor_template)); + memset (&xm->tx_descriptor_template_mask, 0, + sizeof (xm->tx_descriptor_template_mask)); + xm->tx_descriptor_template.status0 = + (IXGE_TX_DESCRIPTOR_STATUS0_ADVANCED | + IXGE_TX_DESCRIPTOR_STATUS0_IS_ADVANCED | + IXGE_TX_DESCRIPTOR_STATUS0_INSERT_FCS); + xm->tx_descriptor_template_mask.status0 = 0xffff; + xm->tx_descriptor_template_mask.status1 = 0x00003fff; + + xm->tx_descriptor_template_mask.status0 &= + ~(IXGE_TX_DESCRIPTOR_STATUS0_IS_END_OF_PACKET + | IXGE_TX_DESCRIPTOR_STATUS0_REPORT_STATUS); + xm->tx_descriptor_template_mask.status1 &= + ~(IXGE_TX_DESCRIPTOR_STATUS1_DONE); + + error = vlib_call_init_function (vm, pci_bus_init); + + return error; +} + +VLIB_INIT_FUNCTION (ixge_init); + + +static void +ixge_pci_intr_handler (vlib_pci_device_t * dev) +{ + ixge_main_t *xm = &ixge_main; + vlib_main_t *vm = xm->vlib_main; + + vlib_node_set_interrupt_pending (vm, ixge_input_node.index); + + /* Let node know which device is interrupting. 
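+ Each device sets its bit in runtime_data[0]; ixge_input walks the
+ bitmask with foreach_set_bit and clears it once serviced.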
*/ + { + vlib_node_runtime_t *rt = + vlib_node_get_runtime (vm, ixge_input_node.index); + rt->runtime_data[0] |= 1 << dev->private_data; + } +} + +static clib_error_t * +ixge_pci_init (vlib_main_t * vm, vlib_pci_device_t * dev) +{ + ixge_main_t *xm = &ixge_main; + clib_error_t *error; + void *r; + ixge_device_t *xd; + + /* Allocate physmem region for DMA buffers */ + error = vlib_physmem_region_alloc (vm, "ixge descriptors", 2 << 20, 0, + VLIB_PHYSMEM_F_INIT_MHEAP, + &xm->physmem_region); + if (error) + return error; + + error = vlib_pci_map_resource (dev, 0, &r); + if (error) + return error; + + vec_add2 (xm->devices, xd, 1); + + if (vec_len (xm->devices) == 1) + { + ixge_input_node.function = ixge_input_multiarch_select (); + } + + xd->pci_device = dev[0]; + xd->device_id = xd->pci_device.config0.header.device_id; + xd->regs = r; + xd->device_index = xd - xm->devices; + xd->pci_function = dev->bus_address.function; + xd->per_interface_next_index = ~0; + + + /* Chip found so enable node. */ + { + vlib_node_set_state (vm, ixge_input_node.index, + (IXGE_ALWAYS_POLL + ? VLIB_NODE_STATE_POLLING + : VLIB_NODE_STATE_INTERRUPT)); + + dev->private_data = xd->device_index; + } + + if (vec_len (xm->devices) == 1) + { + vlib_register_node (vm, &ixge_process_node); + xm->process_node_index = ixge_process_node.index; + } + + error = vlib_pci_bus_master_enable (dev); + + if (error) + return error; + + return vlib_pci_intr_enable (dev); +} + +/* *INDENT-OFF* */ +PCI_REGISTER_DEVICE (ixge_pci_device_registration,static) = { + .init_function = ixge_pci_init, + .interrupt_handler = ixge_pci_intr_handler, + .supported_devices = { +#define _(t,i) { .vendor_id = PCI_VENDOR_ID_INTEL, .device_id = i, }, + foreach_ixge_pci_device_id +#undef _ + { 0 }, + }, +}; +/* *INDENT-ON* */ + +void +ixge_set_next_node (ixge_rx_next_t next, char *name) +{ + vlib_node_registration_t *r = &ixge_input_node; + + switch (next) + { + case IXGE_RX_NEXT_IP4_INPUT: + case IXGE_RX_NEXT_IP6_INPUT: + case IXGE_RX_NEXT_ETHERNET_INPUT: + r->next_nodes[next] = name; + break; + + default: + clib_warning ("%s: illegal next %d\n", __FUNCTION__, next); + break; + } +} + +/* *INDENT-OFF* */ +VLIB_PLUGIN_REGISTER () = { + .version = VPP_BUILD_VER, + .default_disabled = 1, + .description = "Intel 82599 Family Native Driver (experimental)", +}; +#endif + +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ixge/ixge.h b/src/plugins/ixge/ixge.h new file mode 100644 index 00000000..42c1bfa5 --- /dev/null +++ b/src/plugins/ixge/ixge.h @@ -0,0 +1,1295 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#ifndef included_ixge_h +#define included_ixge_h + +#include <vnet/vnet.h> +#include <vlib/pci/pci.h> +#include <vlib/i2c.h> +#include <vnet/ethernet/sfp.h> +#include <vnet/ip/ip4_packet.h> +#include <vnet/ip/ip6_packet.h> + +typedef volatile struct +{ + /* [31:7] 128 byte aligned. */ + u32 descriptor_address[2]; + u32 n_descriptor_bytes; + + /* [5] rx/tx descriptor dca enable + [6] rx packet head dca enable + [7] rx packet tail dca enable + [9] rx/tx descriptor relaxed order + [11] rx/tx descriptor write back relaxed order + [13] rx/tx data write/read relaxed order + [15] rx head data write relaxed order + [31:24] apic id for cpu's cache. */ + u32 dca_control; + + u32 head_index; + + /* [4:0] tail buffer size (in 1k byte units) + [13:8] head buffer size (in 64 byte units) + [24:22] lo free descriptors threshold (units of 64 descriptors) + [27:25] descriptor type 0 = legacy, 1 = advanced one buffer (e.g. tail), + 2 = advanced header splitting (head + tail), 5 = advanced header + splitting (head only). + [28] drop if no descriptors available. */ + u32 rx_split_control; + + u32 tail_index; + CLIB_PAD_FROM_TO (0x1c, 0x28); + + /* [7:0] rx/tx prefetch threshold + [15:8] rx/tx host threshold + [24:16] rx/tx write back threshold + [25] rx/tx enable + [26] tx descriptor writeback flush + [30] rx strip vlan enable */ + u32 control; + + u32 rx_coallesce_control; + + union + { + struct + { + /* packets bytes lo hi */ + u32 stats[3]; + + u32 unused; + } rx; + + struct + { + u32 unused[2]; + + /* [0] enables head write back. */ + u32 head_index_write_back_address[2]; + } tx; + }; +} ixge_dma_regs_t; + +/* Only advanced descriptors are supported. */ +typedef struct +{ + u64 tail_address; + u64 head_address; +} ixge_rx_to_hw_descriptor_t; + +typedef struct +{ + u32 status[3]; + u16 n_packet_bytes_this_descriptor; + u16 vlan_tag; +} ixge_rx_from_hw_descriptor_t; + +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_LAYER2 (1 << (4 + 11)) +/* Valid if not layer2. */ +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4 (1 << (4 + 0)) +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4_EXT (1 << (4 + 1)) +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6 (1 << (4 + 2)) +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6_EXT (1 << (4 + 3)) +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_TCP (1 << (4 + 4)) +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_UDP (1 << (4 + 5)) +#define IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET(s) (((s) >> 21) & 0x3ff) + +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE (1 << (0 + 0)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET (1 << (0 + 1)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_VLAN (1 << (0 + 3)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED (1 << (0 + 4)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED (1 << (0 + 5)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED (1 << (0 + 6)) +#define IXGE_RX_DESCRIPTOR_STATUS2_NOT_UNICAST (1 << (0 + 7)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_DOUBLE_VLAN (1 << (0 + 9)) +#define IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR (1 << (0 + 10)) +#define IXGE_RX_DESCRIPTOR_STATUS2_ETHERNET_ERROR (1 << (20 + 9)) +#define IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR (1 << (20 + 10)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR (1 << (20 + 11)) + +/* For layer2 packets stats0 bottom 3 bits give ether type index from filter. 
*/ +#define IXGE_RX_DESCRIPTOR_STATUS0_LAYER2_ETHERNET_TYPE(s) ((s) & 7) + +typedef struct +{ + u64 buffer_address; + u16 n_bytes_this_buffer; + u16 status0; + u32 status1; +#define IXGE_TX_DESCRIPTOR_STATUS0_ADVANCED (3 << 4) +#define IXGE_TX_DESCRIPTOR_STATUS0_IS_ADVANCED (1 << (8 + 5)) +#define IXGE_TX_DESCRIPTOR_STATUS0_LOG2_REPORT_STATUS (8 + 3) +#define IXGE_TX_DESCRIPTOR_STATUS0_REPORT_STATUS (1 << IXGE_TX_DESCRIPTOR_STATUS0_LOG2_REPORT_STATUS) +#define IXGE_TX_DESCRIPTOR_STATUS0_INSERT_FCS (1 << (8 + 1)) +#define IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET (8 + 0) +#define IXGE_TX_DESCRIPTOR_STATUS0_IS_END_OF_PACKET (1 << IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET) +#define IXGE_TX_DESCRIPTOR_STATUS1_DONE (1 << 0) +#define IXGE_TX_DESCRIPTOR_STATUS1_CONTEXT(i) (/* valid */ (1 << 7) | ((i) << 4)) +#define IXGE_TX_DESCRIPTOR_STATUS1_IPSEC_OFFLOAD (1 << (8 + 2)) +#define IXGE_TX_DESCRIPTOR_STATUS1_INSERT_TCP_UDP_CHECKSUM (1 << (8 + 1)) +#define IXGE_TX_DESCRIPTOR_STATUS1_INSERT_IP4_CHECKSUM (1 << (8 + 0)) +#define IXGE_TX_DESCRIPTOR_STATUS0_N_BYTES_THIS_BUFFER(l) ((l) << 0) +#define IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET(l) ((l) << 14) +} ixge_tx_descriptor_t; + +typedef struct +{ + struct + { + u8 checksum_start_offset; + u8 checksum_insert_offset; + u16 checksum_end_offset; + } ip, tcp; + u32 status0; + + u8 status1; + + /* Byte offset after UDP/TCP header. */ + u8 payload_offset; + + u16 max_tcp_segment_size; +} __attribute__ ((packed)) ixge_tx_context_descriptor_t; + +typedef union +{ + ixge_rx_to_hw_descriptor_t rx_to_hw; + ixge_rx_from_hw_descriptor_t rx_from_hw; + ixge_tx_descriptor_t tx; + u32x4 as_u32x4; +} ixge_descriptor_t; + +typedef volatile struct +{ + /* [2] pcie master disable + [3] mac reset + [26] global device reset */ + u32 control; + u32 control_alias; + /* [3:2] device id (0 or 1 for dual port chips) + [7] link is up + [17:10] num vfs + [18] io active + [19] pcie master enable status */ + u32 status_read_only; + CLIB_PAD_FROM_TO (0xc, 0x18); + /* [14] pf reset done + [17] relaxed ordering disable + [26] extended vlan enable + [28] driver loaded */ + u32 extended_control; + CLIB_PAD_FROM_TO (0x1c, 0x20); + + /* software definable pins. + sdp_data [7:0] + sdp_is_output [15:8] + sdp_is_native [23:16] + sdp_function [31:24]. + */ + u32 sdp_control; + CLIB_PAD_FROM_TO (0x24, 0x28); + + /* [0] i2c clock in + [1] i2c clock out + [2] i2c data in + [3] i2c data out */ + u32 i2c_control; + CLIB_PAD_FROM_TO (0x2c, 0x4c); + u32 tcp_timer; + + CLIB_PAD_FROM_TO (0x50, 0x200); + + u32 led_control; + + CLIB_PAD_FROM_TO (0x204, 0x600); + u32 core_spare; + CLIB_PAD_FROM_TO (0x604, 0x700); + + struct + { + u32 vflr_events_clear[4]; + u32 mailbox_interrupt_status[4]; + u32 mailbox_interrupt_enable[4]; + CLIB_PAD_FROM_TO (0x730, 0x800); + } pf_foo; + + struct + { + u32 status_write_1_to_clear; + CLIB_PAD_FROM_TO (0x804, 0x808); + u32 status_write_1_to_set; + CLIB_PAD_FROM_TO (0x80c, 0x810); + u32 status_auto_clear_enable; + CLIB_PAD_FROM_TO (0x814, 0x820); + + /* [11:3] minimum inter-interrupt interval + (2e-6 units; 20e-6 units for fast ethernet). + [15] low-latency interrupt moderation enable + [20:16] low-latency interrupt credit + [27:21] interval counter + [31] write disable for credit and counter (write only). 
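+
+       Worked example: a 10 usec minimum inter-interrupt interval is
+       10e-6 / 2e-6 = 5, written as 5 << 3 into bits [11:3]; this is
+       what ixge_throttle_queue_interrupt() later in this header
+       computes.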
*/ + u32 throttle0[24]; + + u32 enable_write_1_to_set; + CLIB_PAD_FROM_TO (0x884, 0x888); + u32 enable_write_1_to_clear; + CLIB_PAD_FROM_TO (0x88c, 0x890); + u32 enable_auto_clear; + u32 msi_to_eitr_select; + /* [3:0] spd 0-3 interrupt detection enable + [4] msi-x enable + [5] other clear disable (makes other bits in status not clear on read) + etc. */ + u32 control; + CLIB_PAD_FROM_TO (0x89c, 0x900); + + /* Defines interrupt mapping for 128 rx + 128 tx queues. + 64 x 4 8 bit entries. + For register [i]: + [5:0] bit in interrupt status for rx queue 2*i + 0 + [7] valid bit + [13:8] bit for tx queue 2*i + 0 + [15] valid bit + similar for rx 2*i + 1 and tx 2*i + 1. */ + u32 queue_mapping[64]; + + /* tcp timer [7:0] and other interrupts [15:8] */ + u32 misc_mapping; + CLIB_PAD_FROM_TO (0xa04, 0xa90); + + /* 64 interrupts determined by mappings. */ + u32 status1_write_1_to_clear[4]; + u32 enable1_write_1_to_set[4]; + u32 enable1_write_1_to_clear[4]; + CLIB_PAD_FROM_TO (0xac0, 0xad0); + u32 status1_enable_auto_clear[4]; + CLIB_PAD_FROM_TO (0xae0, 0x1000); + } interrupt; + + ixge_dma_regs_t rx_dma0[64]; + + CLIB_PAD_FROM_TO (0x2000, 0x2140); + u32 dcb_rx_packet_plane_t4_config[8]; + u32 dcb_rx_packet_plane_t4_status[8]; + CLIB_PAD_FROM_TO (0x2180, 0x2300); + + /* reg i defines mapping for 4 rx queues starting at 4*i + 0. */ + u32 rx_queue_stats_mapping[32]; + u32 rx_queue_stats_control; + + CLIB_PAD_FROM_TO (0x2384, 0x2410); + u32 fc_user_descriptor_ptr[2]; + u32 fc_buffer_control; + CLIB_PAD_FROM_TO (0x241c, 0x2420); + u32 fc_rx_dma; + CLIB_PAD_FROM_TO (0x2424, 0x2430); + u32 dcb_packet_plane_control; + CLIB_PAD_FROM_TO (0x2434, 0x2f00); + + u32 rx_dma_control; + u32 pf_queue_drop_enable; + CLIB_PAD_FROM_TO (0x2f08, 0x2f20); + u32 rx_dma_descriptor_cache_config; + CLIB_PAD_FROM_TO (0x2f24, 0x3000); + + /* 1 bit. */ + u32 rx_enable; + CLIB_PAD_FROM_TO (0x3004, 0x3008); + /* [15:0] ether type (little endian) + [31:16] opcode (big endian) */ + u32 flow_control_control; + CLIB_PAD_FROM_TO (0x300c, 0x3020); + /* 3 bit traffic class for each of 8 priorities. */ + u32 rx_priority_to_traffic_class; + CLIB_PAD_FROM_TO (0x3024, 0x3028); + u32 rx_coallesce_data_buffer_control; + CLIB_PAD_FROM_TO (0x302c, 0x3190); + u32 rx_packet_buffer_flush_detect; + CLIB_PAD_FROM_TO (0x3194, 0x3200); + u32 flow_control_tx_timers[4]; /* 2 timer values */ + CLIB_PAD_FROM_TO (0x3210, 0x3220); + u32 flow_control_rx_threshold_lo[8]; + CLIB_PAD_FROM_TO (0x3240, 0x3260); + u32 flow_control_rx_threshold_hi[8]; + CLIB_PAD_FROM_TO (0x3280, 0x32a0); + u32 flow_control_refresh_threshold; + CLIB_PAD_FROM_TO (0x32a4, 0x3c00); + /* For each of 8 traffic classes (units of bytes). 
*/ + u32 rx_packet_buffer_size[8]; + CLIB_PAD_FROM_TO (0x3c20, 0x3d00); + u32 flow_control_config; + CLIB_PAD_FROM_TO (0x3d04, 0x4200); + + struct + { + u32 pcs_config; + CLIB_PAD_FROM_TO (0x4204, 0x4208); + u32 link_control; + u32 link_status; + u32 pcs_debug[2]; + u32 auto_negotiation; + u32 link_partner_ability; + u32 auto_negotiation_tx_next_page; + u32 auto_negotiation_link_partner_next_page; + CLIB_PAD_FROM_TO (0x4228, 0x4240); + } gige_mac; + + struct + { + /* [0] tx crc enable + [2] enable frames up to max frame size register [31:16] + [10] pad frames < 64 bytes if specified by user + [15] loopback enable + [16] mdc hi speed + [17] turn off mdc between mdio packets */ + u32 control; + + /* [5] rx symbol error (all bits clear on read) + [6] rx illegal symbol + [7] rx idle error + [8] rx local fault + [9] rx remote fault */ + u32 status; + + u32 pause_and_pace_control; + CLIB_PAD_FROM_TO (0x424c, 0x425c); + u32 phy_command; + u32 phy_data; + CLIB_PAD_FROM_TO (0x4264, 0x4268); + + /* [31:16] max frame size in bytes. */ + u32 rx_max_frame_size; + CLIB_PAD_FROM_TO (0x426c, 0x4288); + + /* [0] + [2] pcs receive link up? (latch lo) + [7] local fault + [1] + [0] pcs 10g base r capable + [1] pcs 10g base x capable + [2] pcs 10g base w capable + [10] rx local fault + [11] tx local fault + [15:14] 2 => device present at this address (else not present) */ + u32 xgxs_status[2]; + + u32 base_x_pcs_status; + + /* [0] pass unrecognized flow control frames + [1] discard pause frames + [2] rx priority flow control enable (only in dcb mode) + [3] rx flow control enable. */ + u32 flow_control; + + /* [3:0] tx lanes change polarity + [7:4] rx lanes change polarity + [11:8] swizzle tx lanes + [15:12] swizzle rx lanes + 4 x 2 bit tx lane swap + 4 x 2 bit rx lane swap. */ + u32 serdes_control; + + u32 fifo_control; + + /* [0] force link up + [1] autoneg ack2 bit to transmit + [6:2] autoneg selector field to transmit + [8:7] 10g pma/pmd type 0 => xaui, 1 kx4, 2 cx4 + [9] 1g pma/pmd type 0 => sfi, 1 => kx/bx + [10] disable 10g on without main power + [11] restart autoneg on transition to dx power state + [12] restart autoneg + [15:13] link mode: + 0 => 1g no autoneg + 1 => 10g kx4 parallel link no autoneg + 2 => 1g bx autoneg + 3 => 10g sfi serdes + 4 => kx4/kx/kr + 5 => xgmii 1g/100m + 6 => kx4/kx/kr 1g an + 7 kx4/kx/kr sgmii. + [16] kr support + [17] fec requested + [18] fec ability + etc. */ + u32 auto_negotiation_control; + + /* [0] signal detect 1g/100m + [1] fec signal detect + [2] 10g serial pcs fec block lock + [3] 10g serial high error rate + [4] 10g serial pcs block lock + [5] kx/kx4/kr autoneg next page received + [6] kx/kx4/kr backplane autoneg next page received + [7] link status clear to read + [11:8] 10g signal detect (4 lanes) (for serial just lane 0) + [12] 10g serial signal detect + [16:13] 10g parallel lane sync status + [17] 10g parallel align status + [18] 1g sync status + [19] kx/kx4/kr backplane autoneg is idle + [20] 1g autoneg enabled + [21] 1g pcs enabled for sgmii + [22] 10g xgxs enabled + [23] 10g serial fec enabled (forward error detection) + [24] 10g kr pcs enabled + [25] sgmii enabled + [27:26] mac link mode + 0 => 1g + 1 => 10g parallel + 2 => 10g serial + 3 => autoneg + [29:28] link speed + 1 => 100m + 2 => 1g + 3 => 10g + [30] link is up + [31] kx/kx4/kr backplane autoneg completed successfully. 
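+
+       Example decode: speed = (link_status >> 28) & 3 yields
+       1 => 100M, 2 => 1G, 3 => 10G, and (link_status >> 30) & 1
+       tells whether the link is up.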
*/ + u32 link_status; + + /* [17:16] pma/pmd for 10g serial + 0 => kr, 2 => sfi + [18] disable dme pages */ + u32 auto_negotiation_control2; + + CLIB_PAD_FROM_TO (0x42ac, 0x42b0); + u32 link_partner_ability[2]; + CLIB_PAD_FROM_TO (0x42b8, 0x42d0); + u32 manageability_control; + u32 link_partner_next_page[2]; + CLIB_PAD_FROM_TO (0x42dc, 0x42e0); + u32 kr_pcs_control; + u32 kr_pcs_status; + u32 fec_status[2]; + CLIB_PAD_FROM_TO (0x42f0, 0x4314); + u32 sgmii_control; + CLIB_PAD_FROM_TO (0x4318, 0x4324); + u32 link_status2; + CLIB_PAD_FROM_TO (0x4328, 0x4900); + } xge_mac; + + u32 tx_dcb_control; + u32 tx_dcb_descriptor_plane_queue_select; + u32 tx_dcb_descriptor_plane_t1_config; + u32 tx_dcb_descriptor_plane_t1_status; + CLIB_PAD_FROM_TO (0x4910, 0x4950); + + /* For each TC in units of 1k bytes. */ + u32 tx_packet_buffer_thresholds[8]; + CLIB_PAD_FROM_TO (0x4970, 0x4980); + struct + { + u32 mmw; + u32 config; + u32 status; + u32 rate_drift; + } dcb_tx_rate_scheduler; + CLIB_PAD_FROM_TO (0x4990, 0x4a80); + u32 tx_dma_control; + CLIB_PAD_FROM_TO (0x4a84, 0x4a88); + u32 tx_dma_tcp_flags_control[2]; + CLIB_PAD_FROM_TO (0x4a90, 0x4b00); + u32 pf_mailbox[64]; + CLIB_PAD_FROM_TO (0x4c00, 0x5000); + + /* RX */ + u32 checksum_control; + CLIB_PAD_FROM_TO (0x5004, 0x5008); + u32 rx_filter_control; + CLIB_PAD_FROM_TO (0x500c, 0x5010); + u32 management_vlan_tag[8]; + u32 management_udp_tcp_ports[8]; + CLIB_PAD_FROM_TO (0x5050, 0x5078); + /* little endian. */ + u32 extended_vlan_ether_type; + CLIB_PAD_FROM_TO (0x507c, 0x5080); + /* [1] store/dma bad packets + [8] accept all multicast + [9] accept all unicast + [10] accept all broadcast. */ + u32 filter_control; + CLIB_PAD_FROM_TO (0x5084, 0x5088); + /* [15:0] vlan ethernet type (0x8100) little endian + [28] cfi bit expected + [29] drop packets with unexpected cfi bit + [30] vlan filter enable. */ + u32 vlan_control; + CLIB_PAD_FROM_TO (0x508c, 0x5090); + /* [1:0] hi bit of ethernet address for 12 bit index into multicast table + 0 => 47, 1 => 46, 2 => 45, 3 => 43. + [2] enable multicast filter + */ + u32 multicast_control; + CLIB_PAD_FROM_TO (0x5094, 0x5100); + u32 fcoe_rx_control; + CLIB_PAD_FROM_TO (0x5104, 0x5108); + u32 fc_flt_context; + CLIB_PAD_FROM_TO (0x510c, 0x5110); + u32 fc_filter_control; + CLIB_PAD_FROM_TO (0x5114, 0x5120); + u32 rx_message_type_lo; + CLIB_PAD_FROM_TO (0x5124, 0x5128); + /* [15:0] ethernet type (little endian) + [18:16] matche pri in vlan tag + [19] priority match enable + [25:20] virtualization pool + [26] pool enable + [27] is fcoe + [30] ieee 1588 timestamp enable + [31] filter enable. + (See ethernet_type_queue_select.) */ + u32 ethernet_type_queue_filter[8]; + CLIB_PAD_FROM_TO (0x5148, 0x5160); + /* [7:0] l2 ethernet type and + [15:8] l2 ethernet type or */ + u32 management_decision_filters1[8]; + u32 vf_vm_tx_switch_loopback_enable[2]; + u32 rx_time_sync_control; + CLIB_PAD_FROM_TO (0x518c, 0x5190); + u32 management_ethernet_type_filters[4]; + u32 rx_timestamp_attributes_lo; + u32 rx_timestamp_hi; + u32 rx_timestamp_attributes_hi; + CLIB_PAD_FROM_TO (0x51ac, 0x51b0); + u32 pf_virtual_control; + CLIB_PAD_FROM_TO (0x51b4, 0x51d8); + u32 fc_offset_parameter; + CLIB_PAD_FROM_TO (0x51dc, 0x51e0); + u32 vf_rx_enable[2]; + u32 rx_timestamp_lo; + CLIB_PAD_FROM_TO (0x51ec, 0x5200); + /* 12 bits determined by multicast_control + lookup bits in this vector. */ + u32 multicast_enable[128]; + + /* [0] ethernet address [31:0] + [1] [15:0] ethernet address [47:32] + [31] valid bit. + Index 0 is read from eeprom after reset. 
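+
+     A write sketch (the byte order is an assumption, not code taken
+     from this driver): for MAC address bytes m[0..5],
+       entry[0] = m[0] | (m[1] << 8) | (m[2] << 16) | (m[3] << 24);
+       entry[1] = m[4] | (m[5] << 8) | (1 << 31); // set valid bit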
*/ + u32 rx_ethernet_address0[16][2]; + + CLIB_PAD_FROM_TO (0x5480, 0x5800); + u32 wake_up_control; + CLIB_PAD_FROM_TO (0x5804, 0x5808); + u32 wake_up_filter_control; + CLIB_PAD_FROM_TO (0x580c, 0x5818); + u32 multiple_rx_queue_command_82598; + CLIB_PAD_FROM_TO (0x581c, 0x5820); + u32 management_control; + u32 management_filter_control; + CLIB_PAD_FROM_TO (0x5828, 0x5838); + u32 wake_up_ip4_address_valid; + CLIB_PAD_FROM_TO (0x583c, 0x5840); + u32 wake_up_ip4_address_table[4]; + u32 management_control_to_host; + CLIB_PAD_FROM_TO (0x5854, 0x5880); + u32 wake_up_ip6_address_table[4]; + + /* unicast_and broadcast_and vlan_and ip_address_and + etc. */ + u32 management_decision_filters[8]; + + u32 management_ip4_or_ip6_address_filters[4][4]; + CLIB_PAD_FROM_TO (0x58f0, 0x5900); + u32 wake_up_packet_length; + CLIB_PAD_FROM_TO (0x5904, 0x5910); + u32 management_ethernet_address_filters[4][2]; + CLIB_PAD_FROM_TO (0x5930, 0x5a00); + u32 wake_up_packet_memory[32]; + CLIB_PAD_FROM_TO (0x5a80, 0x5c00); + u32 redirection_table_82598[32]; + u32 rss_random_keys_82598[10]; + CLIB_PAD_FROM_TO (0x5ca8, 0x6000); + + ixge_dma_regs_t tx_dma[128]; + + u32 pf_vm_vlan_insert[64]; + u32 tx_dma_tcp_max_alloc_size_requests; + CLIB_PAD_FROM_TO (0x8104, 0x8110); + u32 vf_tx_enable[2]; + CLIB_PAD_FROM_TO (0x8118, 0x8120); + /* [0] dcb mode enable + [1] virtualization mode enable + [3:2] number of tcs/qs per pool. */ + u32 multiple_tx_queues_command; + CLIB_PAD_FROM_TO (0x8124, 0x8200); + u32 pf_vf_anti_spoof[8]; + u32 pf_dma_tx_switch_control; + CLIB_PAD_FROM_TO (0x8224, 0x82e0); + u32 tx_strict_low_latency_queues[4]; + CLIB_PAD_FROM_TO (0x82f0, 0x8600); + u32 tx_queue_stats_mapping_82599[32]; + u32 tx_queue_packet_counts[32]; + u32 tx_queue_byte_counts[32][2]; + + struct + { + u32 control; + u32 status; + u32 buffer_almost_full; + CLIB_PAD_FROM_TO (0x880c, 0x8810); + u32 buffer_min_ifg; + CLIB_PAD_FROM_TO (0x8814, 0x8900); + } tx_security; + + struct + { + u32 index; + u32 salt; + u32 key[4]; + CLIB_PAD_FROM_TO (0x8918, 0x8a00); + } tx_ipsec; + + struct + { + u32 capabilities; + u32 control; + u32 tx_sci[2]; + u32 sa; + u32 sa_pn[2]; + u32 key[2][4]; + /* untagged packets, encrypted packets, protected packets, + encrypted bytes, protected bytes */ + u32 stats[5]; + CLIB_PAD_FROM_TO (0x8a50, 0x8c00); + } tx_link_security; + + struct + { + u32 control; + u32 timestamp_value[2]; + u32 system_time[2]; + u32 increment_attributes; + u32 time_adjustment_offset[2]; + u32 aux_control; + u32 target_time[2][2]; + CLIB_PAD_FROM_TO (0x8c34, 0x8c3c); + u32 aux_time_stamp[2][2]; + CLIB_PAD_FROM_TO (0x8c4c, 0x8d00); + } tx_timesync; + + struct + { + u32 control; + u32 status; + CLIB_PAD_FROM_TO (0x8d08, 0x8e00); + } rx_security; + + struct + { + u32 index; + u32 ip_address[4]; + u32 spi; + u32 ip_index; + u32 key[4]; + u32 salt; + u32 mode; + CLIB_PAD_FROM_TO (0x8e34, 0x8f00); + } rx_ipsec; + + struct + { + u32 capabilities; + u32 control; + u32 sci[2]; + u32 sa[2]; + u32 sa_pn[2]; + u32 key[2][4]; + /* see datasheet */ + u32 stats[17]; + CLIB_PAD_FROM_TO (0x8f84, 0x9000); + } rx_link_security; + + /* 4 wake up, 2 management, 2 wake up. */ + u32 flexible_filters[8][16][4]; + CLIB_PAD_FROM_TO (0x9800, 0xa000); + + /* 4096 bits. */ + u32 vlan_filter[128]; + + /* [0] ethernet address [31:0] + [1] [15:0] ethernet address [47:32] + [31] valid bit. + Index 0 is read from eeprom after reset. */ + u32 rx_ethernet_address1[128][2]; + + /* select one of 64 pools for each rx address. 
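+   Presumably one bit per pool: the 64 pools map onto the two u32
+   words of each entry, which is why the second dimension is 2.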
*/ + u32 rx_ethernet_address_pool_select[128][2]; + CLIB_PAD_FROM_TO (0xaa00, 0xc800); + u32 tx_priority_to_traffic_class; + CLIB_PAD_FROM_TO (0xc804, 0xcc00); + + /* In bytes units of 1k. Total packet buffer is 160k. */ + u32 tx_packet_buffer_size[8]; + + CLIB_PAD_FROM_TO (0xcc20, 0xcd10); + u32 tx_manageability_tc_mapping; + CLIB_PAD_FROM_TO (0xcd14, 0xcd20); + u32 dcb_tx_packet_plane_t2_config[8]; + u32 dcb_tx_packet_plane_t2_status[8]; + CLIB_PAD_FROM_TO (0xcd60, 0xce00); + + u32 tx_flow_control_status; + CLIB_PAD_FROM_TO (0xce04, 0xd000); + + ixge_dma_regs_t rx_dma1[64]; + + struct + { + /* Bigendian ip4 src/dst address. */ + u32 src_address[128]; + u32 dst_address[128]; + + /* TCP/UDP ports [15:0] src [31:16] dst; bigendian. */ + u32 tcp_udp_port[128]; + + /* [1:0] protocol tcp, udp, sctp, other + [4:2] match priority (highest wins) + [13:8] pool + [25] src address match disable + [26] dst address match disable + [27] src port match disable + [28] dst port match disable + [29] protocol match disable + [30] pool match disable + [31] enable. */ + u32 control[128]; + + /* [12] size bypass + [19:13] must be 0x80 + [20] low-latency interrupt + [27:21] rx queue. */ + u32 interrupt[128]; + } ip4_filters; + + CLIB_PAD_FROM_TO (0xea00, 0xeb00); + /* 4 bit rss output index indexed by 7 bit hash. + 128 8 bit fields = 32 registers. */ + u32 redirection_table_82599[32]; + + u32 rss_random_key_82599[10]; + CLIB_PAD_FROM_TO (0xeba8, 0xec00); + /* [15:0] reserved + [22:16] rx queue index + [29] low-latency interrupt on match + [31] enable */ + u32 ethernet_type_queue_select[8]; + CLIB_PAD_FROM_TO (0xec20, 0xec30); + u32 syn_packet_queue_filter; + CLIB_PAD_FROM_TO (0xec34, 0xec60); + u32 immediate_interrupt_rx_vlan_priority; + CLIB_PAD_FROM_TO (0xec64, 0xec70); + u32 rss_queues_per_traffic_class; + CLIB_PAD_FROM_TO (0xec74, 0xec90); + u32 lli_size_threshold; + CLIB_PAD_FROM_TO (0xec94, 0xed00); + + struct + { + u32 control; + CLIB_PAD_FROM_TO (0xed04, 0xed10); + u32 table[8]; + CLIB_PAD_FROM_TO (0xed30, 0xee00); + } fcoe_redirection; + + struct + { + /* [1:0] packet buffer allocation 0 => disabled, else 64k*2^(f-1) + [3] packet buffer initialization done + [4] perfetch match mode + [5] report status in rss field of rx descriptors + [7] report status always + [14:8] drop queue + [20:16] flex 2 byte packet offset (units of 2 bytes) + [27:24] max linked list length + [31:28] full threshold. */ + u32 control; + CLIB_PAD_FROM_TO (0xee04, 0xee0c); + + u32 data[8]; + + /* [1:0] 0 => no action, 1 => add, 2 => remove, 3 => query. + [2] valid filter found by query command + [3] filter update override + [4] ip6 adress table + [6:5] l4 protocol reserved, udp, tcp, sctp + [7] is ip6 + [8] clear head/tail + [9] packet drop action + [10] matched packet generates low-latency interrupt + [11] last in linked list + [12] collision + [15] rx queue enable + [22:16] rx queue + [29:24] pool. */ + u32 command; + + CLIB_PAD_FROM_TO (0xee30, 0xee3c); + /* ip4 dst/src address, tcp ports, udp ports. + set bits mean bit is ignored. */ + u32 ip4_masks[4]; + u32 filter_length; + u32 usage_stats; + u32 failed_usage_stats; + u32 filters_match_stats; + u32 filters_miss_stats; + CLIB_PAD_FROM_TO (0xee60, 0xee68); + /* Lookup, signature. */ + u32 hash_keys[2]; + /* [15:0] ip6 src address 1 bit per byte + [31:16] ip6 dst address. */ + u32 ip6_mask; + /* [0] vlan id + [1] vlan priority + [2] pool + [3] ip protocol + [4] flex + [5] dst ip6. 
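+   As with ip4_masks above, a set bit presumably means the
+   corresponding field is ignored when matching.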
*/ + u32 other_mask; + CLIB_PAD_FROM_TO (0xee78, 0xf000); + } flow_director; + + struct + { + u32 l2_control[64]; + u32 vlan_pool_filter[64]; + u32 vlan_pool_filter_bitmap[128]; + u32 dst_ethernet_address[128]; + u32 mirror_rule[4]; + u32 mirror_rule_vlan[8]; + u32 mirror_rule_pool[8]; + CLIB_PAD_FROM_TO (0xf650, 0x10010); + } pf_bar; + + u32 eeprom_flash_control; + /* [0] start + [1] done + [15:2] address + [31:16] read data. */ + u32 eeprom_read; + CLIB_PAD_FROM_TO (0x10018, 0x1001c); + u32 flash_access; + CLIB_PAD_FROM_TO (0x10020, 0x10114); + u32 flash_data; + u32 flash_control; + u32 flash_read_data; + CLIB_PAD_FROM_TO (0x10120, 0x1013c); + u32 flash_opcode; + u32 software_semaphore; + CLIB_PAD_FROM_TO (0x10144, 0x10148); + u32 firmware_semaphore; + CLIB_PAD_FROM_TO (0x1014c, 0x10160); + u32 software_firmware_sync; + CLIB_PAD_FROM_TO (0x10164, 0x10200); + u32 general_rx_control; + CLIB_PAD_FROM_TO (0x10204, 0x11000); + + struct + { + u32 control; + CLIB_PAD_FROM_TO (0x11004, 0x11010); + /* [3:0] enable counters + [7:4] leaky bucket counter mode + [29] reset + [30] stop + [31] start. */ + u32 counter_control; + /* [7:0],[15:8],[23:16],[31:24] event for counters 0-3. + event codes: + 0x0 bad tlp + 0x10 reqs that reached timeout + etc. */ + u32 counter_event; + CLIB_PAD_FROM_TO (0x11018, 0x11020); + u32 counters_clear_on_read[4]; + u32 counter_config[4]; + struct + { + u32 address; + u32 data; + } indirect_access; + CLIB_PAD_FROM_TO (0x11048, 0x11050); + u32 extended_control; + CLIB_PAD_FROM_TO (0x11054, 0x11064); + u32 mirrored_revision_id; + CLIB_PAD_FROM_TO (0x11068, 0x11070); + u32 dca_requester_id_information; + + /* [0] global disable + [4:1] mode: 0 => legacy, 1 => dca 1.0. */ + u32 dca_control; + CLIB_PAD_FROM_TO (0x11078, 0x110b0); + /* [0] pci completion abort + [1] unsupported i/o address + [2] wrong byte enable + [3] pci timeout */ + u32 pcie_interrupt_status; + CLIB_PAD_FROM_TO (0x110b4, 0x110b8); + u32 pcie_interrupt_enable; + CLIB_PAD_FROM_TO (0x110bc, 0x110c0); + u32 msi_x_pba_clear[8]; + CLIB_PAD_FROM_TO (0x110e0, 0x12300); + } pcie; + + u32 interrupt_throttle1[128 - 24]; + CLIB_PAD_FROM_TO (0x124a0, 0x14f00); + + u32 core_analog_config; + CLIB_PAD_FROM_TO (0x14f04, 0x14f10); + u32 core_common_config; + CLIB_PAD_FROM_TO (0x14f14, 0x15f14); + + u32 link_sec_software_firmware_interface; +} ixge_regs_t; + +typedef union +{ + struct + { + /* Addresses bigendian. */ + union + { + struct + { + ip6_address_t src_address; + u32 unused[1]; + } ip6; + struct + { + u32 unused[3]; + ip4_address_t src_address, dst_address; + } ip4; + }; + + /* [15:0] src port (little endian). + [31:16] dst port. */ + u32 tcp_udp_ports; + + /* [15:0] vlan (cfi bit set to 0). + [31:16] flex bytes. bigendian. */ + u32 vlan_and_flex_word; + + /* [14:0] hash + [15] bucket valid + [31:16] signature (signature filers)/sw-index (perfect match). */ + u32 hash; + }; + + u32 as_u32[8]; +} ixge_flow_director_key_t; + +always_inline void +ixge_throttle_queue_interrupt (ixge_regs_t * r, + u32 queue_interrupt_index, + f64 inter_interrupt_interval_in_secs) +{ + volatile u32 *tr = + (queue_interrupt_index < ARRAY_LEN (r->interrupt.throttle0) + ? &r->interrupt.throttle0[queue_interrupt_index] + : &r->interrupt_throttle1[queue_interrupt_index]); + ASSERT (queue_interrupt_index < 128); + u32 v; + i32 i, mask = (1 << 9) - 1; + + i = flt_round_nearest (inter_interrupt_interval_in_secs / 2e-6); + i = i < 1 ? 1 : i; + i = i >= mask ? 
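/* clamp to the 9-bit throttle interval field */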
mask : i; + + v = tr[0]; + v &= ~(mask << 3); + v |= i << 3; + tr[0] = v; +} + +#define foreach_ixge_counter \ + _ (0x40d0, rx_total_packets) \ + _64 (0x40c0, rx_total_bytes) \ + _ (0x41b0, rx_good_packets_before_filtering) \ + _64 (0x41b4, rx_good_bytes_before_filtering) \ + _ (0x2f50, rx_dma_good_packets) \ + _64 (0x2f54, rx_dma_good_bytes) \ + _ (0x2f5c, rx_dma_duplicated_good_packets) \ + _64 (0x2f60, rx_dma_duplicated_good_bytes) \ + _ (0x2f68, rx_dma_good_loopback_packets) \ + _64 (0x2f6c, rx_dma_good_loopback_bytes) \ + _ (0x2f74, rx_dma_good_duplicated_loopback_packets) \ + _64 (0x2f78, rx_dma_good_duplicated_loopback_bytes) \ + _ (0x4074, rx_good_packets) \ + _64 (0x4088, rx_good_bytes) \ + _ (0x407c, rx_multicast_packets) \ + _ (0x4078, rx_broadcast_packets) \ + _ (0x405c, rx_64_byte_packets) \ + _ (0x4060, rx_65_127_byte_packets) \ + _ (0x4064, rx_128_255_byte_packets) \ + _ (0x4068, rx_256_511_byte_packets) \ + _ (0x406c, rx_512_1023_byte_packets) \ + _ (0x4070, rx_gt_1023_byte_packets) \ + _ (0x4000, rx_crc_errors) \ + _ (0x4120, rx_ip_checksum_errors) \ + _ (0x4004, rx_illegal_symbol_errors) \ + _ (0x4008, rx_error_symbol_errors) \ + _ (0x4034, rx_mac_local_faults) \ + _ (0x4038, rx_mac_remote_faults) \ + _ (0x4040, rx_length_errors) \ + _ (0x41a4, rx_xons) \ + _ (0x41a8, rx_xoffs) \ + _ (0x40a4, rx_undersize_packets) \ + _ (0x40a8, rx_fragments) \ + _ (0x40ac, rx_oversize_packets) \ + _ (0x40b0, rx_jabbers) \ + _ (0x40b4, rx_management_packets) \ + _ (0x40b8, rx_management_drops) \ + _ (0x3fa0, rx_missed_packets_pool_0) \ + _ (0x40d4, tx_total_packets) \ + _ (0x4080, tx_good_packets) \ + _64 (0x4090, tx_good_bytes) \ + _ (0x40f0, tx_multicast_packets) \ + _ (0x40f4, tx_broadcast_packets) \ + _ (0x87a0, tx_dma_good_packets) \ + _64 (0x87a4, tx_dma_good_bytes) \ + _ (0x40d8, tx_64_byte_packets) \ + _ (0x40dc, tx_65_127_byte_packets) \ + _ (0x40e0, tx_128_255_byte_packets) \ + _ (0x40e4, tx_256_511_byte_packets) \ + _ (0x40e8, tx_512_1023_byte_packets) \ + _ (0x40ec, tx_gt_1023_byte_packets) \ + _ (0x4010, tx_undersize_drops) \ + _ (0x8780, switch_security_violation_packets) \ + _ (0x5118, fc_crc_errors) \ + _ (0x241c, fc_rx_drops) \ + _ (0x2424, fc_last_error_count) \ + _ (0x2428, fcoe_rx_packets) \ + _ (0x242c, fcoe_rx_dwords) \ + _ (0x8784, fcoe_tx_packets) \ + _ (0x8788, fcoe_tx_dwords) \ + _ (0x1030, queue_0_rx_count) \ + _ (0x1430, queue_0_drop_count) \ + _ (0x1070, queue_1_rx_count) \ + _ (0x1470, queue_1_drop_count) \ + _ (0x10b0, queue_2_rx_count) \ + _ (0x14b0, queue_2_drop_count) \ + _ (0x10f0, queue_3_rx_count) \ + _ (0x14f0, queue_3_drop_count) \ + _ (0x1130, queue_4_rx_count) \ + _ (0x1530, queue_4_drop_count) \ + _ (0x1170, queue_5_rx_count) \ + _ (0x1570, queue_5_drop_count) \ + _ (0x11b0, queue_6_rx_count) \ + _ (0x15b0, queue_6_drop_count) \ + _ (0x11f0, queue_7_rx_count) \ + _ (0x15f0, queue_7_drop_count) \ + _ (0x1230, queue_8_rx_count) \ + _ (0x1630, queue_8_drop_count) \ + _ (0x1270, queue_9_rx_count) \ + _ (0x1270, queue_9_drop_count) + + + + +typedef enum +{ +#define _(a,f) IXGE_COUNTER_##f, +#define _64(a,f) _(a,f) + foreach_ixge_counter +#undef _ +#undef _64 + IXGE_N_COUNTER, +} ixge_counter_type_t; + +typedef struct +{ + u32 mdio_address; + + /* 32 bit ID read from ID registers. */ + u32 id; +} ixge_phy_t; + +typedef struct +{ + /* Cache aligned descriptors. */ + ixge_descriptor_t *descriptors; + + /* Number of descriptors in table. */ + u32 n_descriptors; + + /* Software head and tail pointers into descriptor ring. 
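+     In the usual ring convention, software advances the tail as it
+     hands descriptors to the chip and the head as completed
+     descriptors are consumed; head_index == tail_index means empty.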
*/ + u32 head_index, tail_index; + + /* Index into dma_queues vector. */ + u32 queue_index; + + /* Buffer indices corresponding to each active descriptor. */ + u32 *descriptor_buffer_indices; + + union + { + struct + { + u32 *volatile head_index_write_back; + + u32 n_buffers_on_ring; + } tx; + + struct + { + /* Buffer indices to use to replenish each descriptor. */ + u32 *replenish_buffer_indices; + + vlib_node_runtime_t *node; + u32 next_index; + + u32 saved_start_of_packet_buffer_index; + + u32 saved_start_of_packet_next_index; + u32 saved_last_buffer_index; + + u32 is_start_of_packet; + + u32 n_descriptors_done_total; + + u32 n_descriptors_done_this_call; + + u32 n_bytes; + } rx; + }; +} ixge_dma_queue_t; + +#define foreach_ixge_pci_device_id \ + _ (82598, 0x10b6) \ + _ (82598_bx, 0x1508) \ + _ (82598af_dual_port, 0x10c6) \ + _ (82598af_single_port, 0x10c7) \ + _ (82598at, 0x10c8) \ + _ (82598at2, 0x150b) \ + _ (82598eb_sfp_lom, 0x10db) \ + _ (82598eb_cx4, 0x10dd) \ + _ (82598_cx4_dual_port, 0x10ec) \ + _ (82598_da_dual_port, 0x10f1) \ + _ (82598_sr_dual_port_em, 0x10e1) \ + _ (82598eb_xf_lr, 0x10f4) \ + _ (82599_kx4, 0x10f7) \ + _ (82599_kx4_mezz, 0x1514) \ + _ (82599_kr, 0x1517) \ + _ (82599_combo_backplane, 0x10f8) \ + _ (82599_cx4, 0x10f9) \ + _ (82599_sfp, 0x10fb) \ + _ (82599_backplane_fcoe, 0x152a) \ + _ (82599_sfp_fcoe, 0x1529) \ + _ (82599_sfp_em, 0x1507) \ + _ (82599_xaui_lom, 0x10fc) \ + _ (82599_t3_lom, 0x151c) \ + _ (x540t, 0x1528) + +typedef enum +{ +#define _(f,n) IXGE_##f = n, + foreach_ixge_pci_device_id +#undef _ +} ixge_pci_device_id_t; + +typedef struct +{ + /* registers */ + ixge_regs_t *regs; + + /* Specific next index when using dynamic redirection */ + u32 per_interface_next_index; + + /* PCI bus info. */ + vlib_pci_device_t pci_device; + + /* From PCI config space header. */ + ixge_pci_device_id_t device_id; + + u16 device_index; + + /* 0 or 1. */ + u16 pci_function; + + /* VLIB interface for this instance. */ + u32 vlib_hw_if_index, vlib_sw_if_index; + + ixge_dma_queue_t *dma_queues[VLIB_N_RX_TX]; + + /* Phy index (0 or 1) and address on MDI bus. */ + u32 phy_index; + ixge_phy_t phys[2]; + + /* Value of link_status register at last link change. */ + u32 link_status_at_last_link_change; + + i2c_bus_t i2c_bus; + sfp_eeprom_t sfp_eeprom; + + /* Counters. */ + u64 counters[IXGE_N_COUNTER], counters_last_clear[IXGE_N_COUNTER]; +} ixge_device_t; + +typedef struct +{ + vlib_main_t *vlib_main; + + /* Vector of devices. */ + ixge_device_t *devices; + + /* Descriptor ring sizes. */ + u32 n_descriptors[VLIB_N_RX_TX]; + + /* RX buffer size. Must be at least 1k; will be rounded to + next largest 1k size. */ + u32 n_bytes_in_rx_buffer; + + u32 n_descriptors_per_cache_line; + + u32 vlib_buffer_free_list_index; + + u32 process_node_index; + + /* Template and mask for initializing/validating TX descriptors. */ + ixge_tx_descriptor_t tx_descriptor_template, tx_descriptor_template_mask; + + /* Vector of buffers for which TX is done and can be freed. 
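+     Buffers are batched here so they can be returned to the buffer
+     pool with one vlib_buffer_free() call rather than a call per
+     packet.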
*/ + u32 *tx_buffers_pending_free; + + u32 *rx_buffers_to_add; + + f64 time_last_stats_update; + + vlib_physmem_region_index_t physmem_region; +} ixge_main_t; + +ixge_main_t ixge_main; +vnet_device_class_t ixge_device_class; + +typedef enum +{ + IXGE_RX_NEXT_IP4_INPUT, + IXGE_RX_NEXT_IP6_INPUT, + IXGE_RX_NEXT_ETHERNET_INPUT, + IXGE_RX_NEXT_DROP, + IXGE_RX_N_NEXT, +} ixge_rx_next_t; + +void ixge_set_next_node (ixge_rx_next_t, char *); + +#endif /* included_ixge_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/lb.am b/src/plugins/lb.am new file mode 100644 index 00000000..352358fa --- /dev/null +++ b/src/plugins/lb.am @@ -0,0 +1,42 @@ +# Copyright (c) 2016 Cisco Systems, Inc. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +vppapitestplugins_LTLIBRARIES += lb_test_plugin.la +vppplugins_LTLIBRARIES += lb_plugin.la + +lb_plugin_la_SOURCES = \ + lb/lb.c \ + lb/node.c \ + lb/cli.c \ + lb/util.c \ + lb/refcount.c \ + lb/api.c + +BUILT_SOURCES += \ + lb/lb.api.h \ + lb/lb.api.json + +API_FILES += lb/lb.api + +noinst_HEADERS += \ + lb/lb.h \ + lb/util.h \ + lb/refcount.h \ + lb/lbhash.h \ + lb/lb.api.h + +lb_test_plugin_la_SOURCES = \ + lb/lb_test.c \ + lb/lb_plugin.api.h + +# vi:syntax=automake diff --git a/src/plugins/lb/api.c b/src/plugins/lb/api.c new file mode 100644 index 00000000..9e3bcd65 --- /dev/null +++ b/src/plugins/lb/api.c @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <lb/lb.h> + +#include <vppinfra/byte_order.h> +#include <vlibapi/api.h> +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vlibsocket/api.h> + +#define vl_msg_id(n,h) n, +typedef enum { +#include <lb/lb.api.h> + /* We'll want to know how many messages IDs we need... */ + VL_MSG_FIRST_AVAILABLE, +} vl_msg_id_t; +#undef vl_msg_id + + +/* define message structures */ +#define vl_typedefs +#include <lb/lb.api.h> +#undef vl_typedefs + +/* define generated endian-swappers */ +#define vl_endianfun +#include <lb/lb.api.h> +#undef vl_endianfun + +#define vl_print(handle, ...) 
vlib_cli_output (handle, __VA_ARGS__) + +/* Get the API version number */ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <lb/lb.api.h> +#undef vl_api_version + +#define vl_msg_name_crc_list +#include <lb/lb.api.h> +#undef vl_msg_name_crc_list + + +#define REPLY_MSG_ID_BASE lbm->msg_id_base +#include <vlibapi/api_helper_macros.h> + +static void +setup_message_id_table (lb_main_t * lbm, api_main_t * am) +{ +#define _(id,n,crc) \ + vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id + lbm->msg_id_base); + foreach_vl_msg_name_crc_lb; +#undef _ +} + +/* Macro to finish up custom dump fns */ +#define FINISH \ + vec_add1 (s, 0); \ + vl_print (handle, (char *)s); \ + vec_free (s); \ + return handle; + +static void +vl_api_lb_conf_t_handler +(vl_api_lb_conf_t * mp) +{ + lb_main_t *lbm = &lb_main; + vl_api_lb_conf_reply_t * rmp; + int rv = 0; + + rv = lb_conf((ip4_address_t *)&mp->ip4_src_address, + (ip6_address_t *)mp->ip6_src_address, + mp->sticky_buckets_per_core, + mp->flow_timeout); + + REPLY_MACRO (VL_API_LB_CONF_REPLY); +} + +static void *vl_api_lb_conf_t_print +(vl_api_lb_conf_t *mp, void * handle) +{ + u8 * s; + s = format (0, "SCRIPT: lb_conf "); + s = format (s, "%U ", format_ip4_address, (ip4_address_t *)&mp->ip4_src_address); + s = format (s, "%U ", format_ip6_address, (ip6_address_t *)mp->ip6_src_address); + s = format (s, "%u ", mp->sticky_buckets_per_core); + s = format (s, "%u ", mp->flow_timeout); + FINISH; +} + + +static void +vl_api_lb_add_del_vip_t_handler +(vl_api_lb_add_del_vip_t * mp) +{ + lb_main_t *lbm = &lb_main; + vl_api_lb_conf_reply_t * rmp; + int rv = 0; + ip46_address_t prefix; + memcpy(&prefix.ip6, mp->ip_prefix, sizeof(prefix.ip6)); + + if (mp->is_del) { + u32 vip_index; + if (!(rv = lb_vip_find_index(&prefix, mp->prefix_length, &vip_index))) + rv = lb_vip_del(vip_index); + } else { + u32 vip_index; + lb_vip_type_t type; + if (ip46_prefix_is_ip4(&prefix, mp->prefix_length)) { + type = mp->is_gre4?LB_VIP_TYPE_IP4_GRE4:LB_VIP_TYPE_IP4_GRE6; + } else { + type = mp->is_gre4?LB_VIP_TYPE_IP6_GRE4:LB_VIP_TYPE_IP6_GRE6; + } + + rv = lb_vip_add(&prefix, mp->prefix_length, type, + mp->new_flows_table_length, &vip_index); + } + REPLY_MACRO (VL_API_LB_CONF_REPLY); +} + +static void *vl_api_lb_add_del_vip_t_print +(vl_api_lb_add_del_vip_t *mp, void * handle) +{ + u8 * s; + s = format (0, "SCRIPT: lb_add_del_vip "); + s = format (s, "%U ", format_ip46_prefix, + (ip46_address_t *)mp->ip_prefix, mp->prefix_length, IP46_TYPE_ANY); + s = format (s, "%s ", mp->is_gre4?"gre4":"gre6"); + s = format (s, "%u ", mp->new_flows_table_length); + s = format (s, "%s ", mp->is_del?"del":"add"); + FINISH; +} + +static void +vl_api_lb_add_del_as_t_handler +(vl_api_lb_add_del_as_t * mp) +{ + lb_main_t *lbm = &lb_main; + vl_api_lb_conf_reply_t * rmp; + int rv = 0; + u32 vip_index; + if ((rv = lb_vip_find_index((ip46_address_t *)mp->vip_ip_prefix, + mp->vip_prefix_length, &vip_index))) + goto done; + + if (mp->is_del) + rv = lb_vip_del_ass(vip_index, (ip46_address_t *)mp->as_address, 1); + else + rv = lb_vip_add_ass(vip_index, (ip46_address_t *)mp->as_address, 1); + +done: + REPLY_MACRO (VL_API_LB_CONF_REPLY); +} + +static void *vl_api_lb_add_del_as_t_print +(vl_api_lb_add_del_as_t *mp, void * handle) +{ + u8 * s; + s = format (0, "SCRIPT: lb_add_del_as "); + s = format (s, "%U ", format_ip46_prefix, + (ip46_address_t *)mp->vip_ip_prefix, mp->vip_prefix_length, IP46_TYPE_ANY); + s = format (s, "%U ", format_ip46_address, + (ip46_address_t *)mp->as_address, IP46_TYPE_ANY); + s = 
format (s, "%s ", mp->is_del?"del":"add"); + FINISH; +} + +/* List of message types that this plugin understands */ +#define foreach_lb_plugin_api_msg \ +_(LB_CONF, lb_conf) \ +_(LB_ADD_DEL_VIP, lb_add_del_vip) \ +_(LB_ADD_DEL_AS, lb_add_del_as) + +static clib_error_t * lb_api_init (vlib_main_t * vm) +{ + lb_main_t *lbm = &lb_main; + u8 *name = format (0, "lb_%08x%c", api_version, 0); + lbm->msg_id_base = vl_msg_api_get_msg_ids + ((char *) name, VL_MSG_FIRST_AVAILABLE); + +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + lbm->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_lb_plugin_api_msg; +#undef _ + + /* Add our API messages to the global name_crc hash table */ + setup_message_id_table (lbm, &api_main); + + return 0; +} + +VLIB_INIT_FUNCTION (lb_api_init); diff --git a/src/plugins/lb/cli.c b/src/plugins/lb/cli.c new file mode 100644 index 00000000..f6d65201 --- /dev/null +++ b/src/plugins/lb/cli.c @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <lb/lb.h> +#include <lb/util.h> + +static clib_error_t * +lb_vip_command_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + ip46_address_t prefix; + u8 plen; + u32 new_len = 1024; + u8 del = 0; + int ret; + u32 gre4 = 0; + lb_vip_type_t type; + clib_error_t *error = 0; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + if (!unformat(line_input, "%U", unformat_ip46_prefix, &prefix, &plen, IP46_TYPE_ANY)) { + error = clib_error_return (0, "invalid vip prefix: '%U'", + format_unformat_error, line_input); + goto done; + } + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat(line_input, "new_len %d", &new_len)) + ; + else if (unformat(line_input, "del")) + del = 1; + else if (unformat(line_input, "encap gre4")) + gre4 = 1; + else if (unformat(line_input, "encap gre6")) + gre4 = 0; + else { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } + } + + + if (ip46_prefix_is_ip4(&prefix, plen)) { + type = (gre4)?LB_VIP_TYPE_IP4_GRE4:LB_VIP_TYPE_IP4_GRE6; + } else { + type = (gre4)?LB_VIP_TYPE_IP6_GRE4:LB_VIP_TYPE_IP6_GRE6; + } + + lb_garbage_collection(); + + u32 index; + if (!del) { + if ((ret = lb_vip_add(&prefix, plen, type, new_len, &index))) { + error = clib_error_return (0, "lb_vip_add error %d", ret); + goto done; + } else { + vlib_cli_output(vm, "lb_vip_add ok %d", index); + } + } else { + if ((ret = lb_vip_find_index(&prefix, plen, &index))) { + error = clib_error_return (0, "lb_vip_find_index error %d", ret); + goto done; + } else if ((ret = lb_vip_del(index))) { + error = clib_error_return (0, "lb_vip_del error %d", ret); + goto done; + } + } + +done: + unformat_free (line_input); + + return error; +} + 
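+/* Illustrative invocations (per the parser above):
+ *   lb vip 2001:db8::/64 encap gre4 new_len 1024
+ *   lb vip 192.168.0.0/24 encap gre6
+ *   lb vip 192.168.0.0/24 del
+ * An IPv4 prefix is carried internally as an IPv6-mapped prefix of
+ * length 96 + <IPv4 prefix length>. */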
+VLIB_CLI_COMMAND (lb_vip_command, static) = +{ + .path = "lb vip", + .short_help = "lb vip <prefix> [encap (gre6|gre4)] [new_len <n>] [del]", + .function = lb_vip_command_fn, +}; + +static clib_error_t * +lb_as_command_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + ip46_address_t vip_prefix, as_addr; + u8 vip_plen; + ip46_address_t *as_array = 0; + u32 vip_index; + u8 del = 0; + int ret; + clib_error_t *error = 0; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + if (!unformat(line_input, "%U", unformat_ip46_prefix, &vip_prefix, &vip_plen, IP46_TYPE_ANY)) { + error = clib_error_return (0, "invalid as address: '%U'", + format_unformat_error, line_input); + goto done; + } + + if ((ret = lb_vip_find_index(&vip_prefix, vip_plen, &vip_index))) { + error = clib_error_return (0, "lb_vip_find_index error %d", ret); + goto done; + } + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat(line_input, "%U", unformat_ip46_address, &as_addr, IP46_TYPE_ANY)) { + vec_add1(as_array, as_addr); + } else if (unformat(line_input, "del")) { + del = 1; + } else { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (!vec_len(as_array)) { + error = clib_error_return (0, "No AS address provided"); + goto done; + } + + lb_garbage_collection(); + clib_warning("vip index is %d", vip_index); + + if (del) { + if ((ret = lb_vip_del_ass(vip_index, as_array, vec_len(as_array)))) { + error = clib_error_return (0, "lb_vip_del_ass error %d", ret); + goto done; + } + } else { + if ((ret = lb_vip_add_ass(vip_index, as_array, vec_len(as_array)))) { + error = clib_error_return (0, "lb_vip_add_ass error %d", ret); + goto done; + } + } + +done: + unformat_free (line_input); + vec_free(as_array); + + return error; +} + +VLIB_CLI_COMMAND (lb_as_command, static) = +{ + .path = "lb as", + .short_help = "lb as <vip-prefix> [<address> [<address> [...]]] [del]", + .function = lb_as_command_fn, +}; + +static clib_error_t * +lb_conf_command_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + lb_main_t *lbm = &lb_main; + unformat_input_t _line_input, *line_input = &_line_input; + ip4_address_t ip4 = lbm->ip4_src_address; + ip6_address_t ip6 = lbm->ip6_src_address; + u32 per_cpu_sticky_buckets = lbm->per_cpu_sticky_buckets; + u32 per_cpu_sticky_buckets_log2 = 0; + u32 flow_timeout = lbm->flow_timeout; + int ret; + clib_error_t *error = 0; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat(line_input, "ip4-src-address %U", unformat_ip4_address, &ip4)) + ; + else if (unformat(line_input, "ip6-src-address %U", unformat_ip6_address, &ip6)) + ; + else if (unformat(line_input, "buckets %d", &per_cpu_sticky_buckets)) + ; + else if (unformat(line_input, "buckets-log2 %d", &per_cpu_sticky_buckets_log2)) { + if (per_cpu_sticky_buckets_log2 >= 32) + return clib_error_return (0, "buckets-log2 value is too high"); + per_cpu_sticky_buckets = 1 << per_cpu_sticky_buckets_log2; + } else if (unformat(line_input, "timeout %d", &flow_timeout)) + ; + else { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } + } + + lb_garbage_collection(); + + if ((ret = lb_conf(&ip4, &ip6, per_cpu_sticky_buckets, flow_timeout))) { + error = 
clib_error_return (0, "lb_conf error %d", ret); + goto done; + } + +done: + unformat_free (line_input); + + return error; +} + +VLIB_CLI_COMMAND (lb_conf_command, static) = +{ + .path = "lb conf", + .short_help = "lb conf [ip4-src-address <addr>] [ip6-src-address <addr>] [buckets <n>] [timeout <s>]", + .function = lb_conf_command_fn, +}; + +static clib_error_t * +lb_show_command_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vlib_cli_output(vm, "%U", format_lb_main); + return NULL; +} + + +VLIB_CLI_COMMAND (lb_show_command, static) = +{ + .path = "show lb", + .short_help = "show lb", + .function = lb_show_command_fn, +}; + +static clib_error_t * +lb_show_vips_command_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + unformat_input_t line_input; + lb_main_t *lbm = &lb_main; + lb_vip_t *vip; + u8 verbose = 0; + + if (!unformat_user (input, unformat_line_input, &line_input)) + return 0; + + if (unformat(&line_input, "verbose")) + verbose = 1; + + pool_foreach(vip, lbm->vips, { + vlib_cli_output(vm, "%U\n", verbose?format_lb_vip_detailed:format_lb_vip, vip); + }); + + unformat_free (&line_input); + return NULL; +} + +VLIB_CLI_COMMAND (lb_show_vips_command, static) = +{ + .path = "show lb vips", + .short_help = "show lb vips [verbose]", + .function = lb_show_vips_command_fn, +}; diff --git a/src/plugins/lb/lb.api b/src/plugins/lb/lb.api new file mode 100644 index 00000000..32cc669b --- /dev/null +++ b/src/plugins/lb/lb.api @@ -0,0 +1,56 @@ +/** \brief Configure Load-Balancer global parameters + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param ip4_src_address - IPv4 address to be used as source for IPv4 GRE traffic. + @param ip6_src_address - IPv6 address to be used as source for IPv6 GRE traffic. + @param n_sticky_buckets - Number of buckets *per worker thread* in the + established flow table (must be power of 2). + @param flow_timeout - Time in seconds after which, if no packet is received + for a given flow, the flow is removed from the established flow table. +*/ +autoreply define lb_conf +{ + u32 client_index; + u32 context; + u32 ip4_src_address; + u8 ip6_src_address[16]; + u32 sticky_buckets_per_core; + u32 flow_timeout; +}; + +/** \brief Add a virtual address (or prefix) + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param ip_prefix - IP address (IPv4 in lower order 32 bits). + @param prefix_length - IP prefix length (96 + 'IPv4 prefix length' for IPv4). + @param is_gre4 - Encap is ip4 GRE (ip6 GRE otherwise). + @param new_flows_table_length - Size of the new connections flow table used + for this VIP (must be power of 2). + @param is_del - The VIP should be removed. +*/ +autoreply define lb_add_del_vip { + u32 client_index; + u32 context; + u8 ip_prefix[16]; + u8 prefix_length; + u8 is_gre4; + u32 new_flows_table_length; + u8 is_del; +}; + +/** \brief Add an application server for a given VIP + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param vip_ip_prefix - VIP IP address (IPv4 in lower order 32 bits). + @param vip_ip_prefix - VIP IP prefix length (96 + 'IPv4 prefix length' for IPv4). + @param as_address - The application server address (IPv4 in lower order 32 bits). + @param is_del - The AS should be removed. 
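+    Note: like the other messages in this file, this is declared
+    "autoreply", so the API generator also produces a matching
+    lb_add_del_as_reply message carrying a retval.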
+*/ +autoreply define lb_add_del_as { + u32 client_index; + u32 context; + u8 vip_ip_prefix[16]; + u8 vip_prefix_length; + u8 as_address[16]; + u8 is_del; +}; diff --git a/src/plugins/lb/lb.c b/src/plugins/lb/lb.c new file mode 100644 index 00000000..cc3f8532 --- /dev/null +++ b/src/plugins/lb/lb.c @@ -0,0 +1,841 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <lb/lb.h> +#include <vnet/plugin/plugin.h> +#include <vpp/app/version.h> +#include <vnet/api_errno.h> + +//GC runs at most once every so many seconds +#define LB_GARBAGE_RUN 60 + +//After so many seconds. It is assumed that inter-core race condition will not occur. +#define LB_CONCURRENCY_TIMEOUT 10 + +lb_main_t lb_main; + +#define lb_get_writer_lock() do {} while(__sync_lock_test_and_set (lb_main.writer_lock, 1)) +#define lb_put_writer_lock() lb_main.writer_lock[0] = 0 + +static void lb_as_stack (lb_as_t *as); + + +const static char * const lb_dpo_gre4_ip4[] = { "lb4-gre4" , NULL }; +const static char * const lb_dpo_gre4_ip6[] = { "lb6-gre4" , NULL }; +const static char* const * const lb_dpo_gre4_nodes[DPO_PROTO_NUM] = + { + [DPO_PROTO_IP4] = lb_dpo_gre4_ip4, + [DPO_PROTO_IP6] = lb_dpo_gre4_ip6, + }; + +const static char * const lb_dpo_gre6_ip4[] = { "lb4-gre6" , NULL }; +const static char * const lb_dpo_gre6_ip6[] = { "lb6-gre6" , NULL }; +const static char* const * const lb_dpo_gre6_nodes[DPO_PROTO_NUM] = + { + [DPO_PROTO_IP4] = lb_dpo_gre6_ip4, + [DPO_PROTO_IP6] = lb_dpo_gre6_ip6, + }; + +u32 lb_hash_time_now(vlib_main_t * vm) +{ + return (u32) (vlib_time_now(vm) + 10000); +} + +u8 *format_lb_main (u8 * s, va_list * args) +{ + vlib_thread_main_t *tm = vlib_get_thread_main(); + lb_main_t *lbm = &lb_main; + s = format(s, "lb_main"); + s = format(s, " ip4-src-address: %U \n", format_ip4_address, &lbm->ip4_src_address); + s = format(s, " ip6-src-address: %U \n", format_ip6_address, &lbm->ip6_src_address); + s = format(s, " #vips: %u\n", pool_elts(lbm->vips)); + s = format(s, " #ass: %u\n", pool_elts(lbm->ass) - 1); + + u32 thread_index; + for(thread_index = 0; thread_index < tm->n_vlib_mains; thread_index++ ) { + lb_hash_t *h = lbm->per_cpu[thread_index].sticky_ht; + if (h) { + s = format(s, "core %d\n", thread_index); + s = format(s, " timeout: %ds\n", h->timeout); + s = format(s, " usage: %d / %d\n", lb_hash_elts(h, lb_hash_time_now(vlib_get_main())), lb_hash_size(h)); + } + } + + return s; +} + +static char *lb_vip_type_strings[] = { + [LB_VIP_TYPE_IP6_GRE6] = "ip6-gre6", + [LB_VIP_TYPE_IP6_GRE4] = "ip6-gre4", + [LB_VIP_TYPE_IP4_GRE6] = "ip4-gre6", + [LB_VIP_TYPE_IP4_GRE4] = "ip4-gre4", +}; + +u8 *format_lb_vip_type (u8 * s, va_list * args) +{ + lb_vip_type_t vipt = va_arg (*args, lb_vip_type_t); + u32 i; + for (i=0; i<LB_VIP_N_TYPES; i++) + if (vipt == i) + return format(s, lb_vip_type_strings[i]); + return format(s, "_WRONG_TYPE_"); +} + +uword unformat_lb_vip_type (unformat_input_t * input, va_list * args) +{ + lb_vip_type_t *vipt = va_arg (*args, 
lb_vip_type_t *); + u32 i; + for (i=0; i<LB_VIP_N_TYPES; i++) + if (unformat(input, lb_vip_type_strings[i])) { + *vipt = i; + return 1; + } + return 0; +} + +u8 *format_lb_vip (u8 * s, va_list * args) +{ + lb_vip_t *vip = va_arg (*args, lb_vip_t *); + return format(s, "%U %U new_size:%u #as:%u%s", + format_lb_vip_type, vip->type, + format_ip46_prefix, &vip->prefix, vip->plen, IP46_TYPE_ANY, + vip->new_flow_table_mask + 1, + pool_elts(vip->as_indexes), + (vip->flags & LB_VIP_FLAGS_USED)?"":" removed"); +} + +u8 *format_lb_as (u8 * s, va_list * args) +{ + lb_as_t *as = va_arg (*args, lb_as_t *); + return format(s, "%U %s", format_ip46_address, + &as->address, IP46_TYPE_ANY, + (as->flags & LB_AS_FLAGS_USED)?"used":"removed"); +} + +u8 *format_lb_vip_detailed (u8 * s, va_list * args) +{ + lb_main_t *lbm = &lb_main; + lb_vip_t *vip = va_arg (*args, lb_vip_t *); + uword indent = format_get_indent (s); + + s = format(s, "%U %U [%u] %U%s\n" + "%U new_size:%u\n", + format_white_space, indent, + format_lb_vip_type, vip->type, + vip - lbm->vips, format_ip46_prefix, &vip->prefix, vip->plen, IP46_TYPE_ANY, + (vip->flags & LB_VIP_FLAGS_USED)?"":" removed", + format_white_space, indent, + vip->new_flow_table_mask + 1); + + //Print counters + s = format(s, "%U counters:\n", + format_white_space, indent); + u32 i; + for (i=0; i<LB_N_VIP_COUNTERS; i++) + s = format(s, "%U %s: %d\n", + format_white_space, indent, + lbm->vip_counters[i].name, + vlib_get_simple_counter(&lbm->vip_counters[i], vip - lbm->vips)); + + + s = format(s, "%U #as:%u\n", + format_white_space, indent, + pool_elts(vip->as_indexes)); + + //Let's count the buckets for each AS + u32 *count = 0; + vec_validate(count, pool_len(lbm->ass)); //Possibly big alloc for not much... + lb_new_flow_entry_t *nfe; + vec_foreach(nfe, vip->new_flow_table) + count[nfe->as_index]++; + + lb_as_t *as; + u32 *as_index; + pool_foreach(as_index, vip->as_indexes, { + as = &lbm->ass[*as_index]; + s = format(s, "%U %U %d buckets %d flows dpo:%u %s\n", + format_white_space, indent, + format_ip46_address, &as->address, IP46_TYPE_ANY, + count[as - lbm->ass], + vlib_refcount_get(&lbm->as_refcount, as - lbm->ass), + as->dpo.dpoi_index, + (as->flags & LB_AS_FLAGS_USED)?"used":" removed"); + }); + + vec_free(count); + + /* + s = format(s, "%U new flows table:\n", format_white_space, indent); + lb_new_flow_entry_t *nfe; + vec_foreach(nfe, vip->new_flow_table) { + s = format(s, "%U %d: %d\n", format_white_space, indent, nfe - vip->new_flow_table, nfe->as_index); + } + */ + return s; +} + +typedef struct { + u32 as_index; + u32 last; + u32 skip; +} lb_pseudorand_t; + +static int lb_pseudorand_compare(void *a, void *b) +{ + lb_as_t *asa, *asb; + lb_main_t *lbm = &lb_main; + asa = &lbm->ass[((lb_pseudorand_t *)a)->as_index]; + asb = &lbm->ass[((lb_pseudorand_t *)b)->as_index]; + return memcmp(&asa->address, &asb->address, sizeof(asb->address)); +} + +static void lb_vip_garbage_collection(lb_vip_t *vip) +{ + lb_main_t *lbm = &lb_main; + ASSERT (lbm->writer_lock[0]); + + u32 now = (u32) vlib_time_now(vlib_get_main()); + if (!clib_u32_loop_gt(now, vip->last_garbage_collection + LB_GARBAGE_RUN)) + return; + + vip->last_garbage_collection = now; + lb_as_t *as; + u32 *as_index; + pool_foreach(as_index, vip->as_indexes, { + as = &lbm->ass[*as_index]; + if (!(as->flags & LB_AS_FLAGS_USED) && //Not used + clib_u32_loop_gt(now, as->last_used + LB_CONCURRENCY_TIMEOUT) && //Not recently used + (vlib_refcount_get(&lbm->as_refcount, as - lbm->ass) == 0)) + { //Not referenced + 
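+          /* Detach from the FIB graph: stop tracking next-hop
+           * forwarding changes, remove our RR route source, then
+           * recycle the AS entry itself. */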
fib_entry_child_remove(as->next_hop_fib_entry_index,
+                             as->next_hop_child_index);
+      fib_table_entry_delete_index(as->next_hop_fib_entry_index,
+                                   FIB_SOURCE_RR);
+      as->next_hop_fib_entry_index = FIB_NODE_INDEX_INVALID;
+
+      pool_put(vip->as_indexes, as_index);
+      pool_put(lbm->ass, as);
+    }
+  });
+}
+
+void lb_garbage_collection()
+{
+  lb_main_t *lbm = &lb_main;
+  lb_get_writer_lock();
+  lb_vip_t *vip;
+  u32 *to_be_removed_vips = 0, *i;
+  pool_foreach(vip, lbm->vips, {
+    lb_vip_garbage_collection(vip);
+
+    if (!(vip->flags & LB_VIP_FLAGS_USED) &&
+        (pool_elts(vip->as_indexes) == 0)) {
+      vec_add1(to_be_removed_vips, vip - lbm->vips);
+    }
+  });
+
+  vec_foreach(i, to_be_removed_vips) {
+    vip = &lbm->vips[*i];
+    pool_put(lbm->vips, vip);
+    pool_free(vip->as_indexes);
+  }
+
+  vec_free(to_be_removed_vips);
+  lb_put_writer_lock();
+}
+
+static void lb_vip_update_new_flow_table(lb_vip_t *vip)
+{
+  lb_main_t *lbm = &lb_main;
+  lb_new_flow_entry_t *old_table;
+  u32 i, *as_index;
+  lb_new_flow_entry_t *new_flow_table = 0;
+  lb_as_t *as;
+  lb_pseudorand_t *pr, *sort_arr = 0;
+  u32 count;
+
+  ASSERT (lbm->writer_lock[0]); //We must have the lock
+
+  //Check whether at least one AS is configured
+  i = 0;
+  pool_foreach(as_index, vip->as_indexes, {
+    as = &lbm->ass[*as_index];
+    if (as->flags & LB_AS_FLAGS_USED) { //This AS is still in use
+      i = 1;
+      goto out; //'break' would not exit this macro-loop
+    }
+  });
+
+out:
+  if (i == 0) {
+    //No AS is configured: point every bucket at the default entry
+    vec_validate(new_flow_table, vip->new_flow_table_mask);
+    for (i=0; i<vec_len(new_flow_table); i++)
+      new_flow_table[i].as_index = 0;
+
+    goto finished;
+  }
+
+  //First, let's sort the ASs
+  sort_arr = 0;
+  vec_alloc(sort_arr, pool_elts(vip->as_indexes));
+
+  i = 0;
+  pool_foreach(as_index, vip->as_indexes, {
+    as = &lbm->ass[*as_index];
+    if (!(as->flags & LB_AS_FLAGS_USED)) //Not used anymore
+      continue;
+
+    sort_arr[i].as_index = as - lbm->ass;
+    i++;
+  });
+  _vec_len(sort_arr) = i;
+
+  vec_sort_with_function(sort_arr, lb_pseudorand_compare);
+
+  //Now let's pseudo-randomly generate permutations
+  vec_foreach(pr, sort_arr) {
+    lb_as_t *as = &lbm->ass[pr->as_index];
+
+    u64 seed = clib_xxhash(as->address.as_u64[0] ^
+                           as->address.as_u64[1]);
+    /* We have 2^n buckets.
+     * skip must be coprime with 2^n, so skip must be odd.
+     * The Maglev paper actually states that M should be prime,
+     * but a prime modulus has a significant computation cost
+     * (a % operation); using 2^n is cheaper (an & operation).
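+     *
+     * Worked example: with 8 buckets (new_flow_table_mask = 7),
+     * skip = 3 and last = 2 probe buckets 2, 5, 0, 3, 6, 1, 4, 7,
+     * and the AS claims the first bucket that is still unassigned.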
+ */ + pr->skip = ((seed & 0xffffffff) | 1) & vip->new_flow_table_mask; + pr->last = (seed >> 32) & vip->new_flow_table_mask; + } + + //Let's create a new flow table + vec_validate(new_flow_table, vip->new_flow_table_mask); + for (i=0; i<vec_len(new_flow_table); i++) + new_flow_table[i].as_index = ~0; + + u32 done = 0; + while (1) { + vec_foreach(pr, sort_arr) { + while (1) { + u32 last = pr->last; + pr->last = (pr->last + pr->skip) & vip->new_flow_table_mask; + if (new_flow_table[last].as_index == ~0) { + new_flow_table[last].as_index = pr->as_index; + break; + } + } + done++; + if (done == vec_len(new_flow_table)) + goto finished; + } + } + + vec_free(sort_arr); + +finished: + +//Count number of changed entries + count = 0; + for (i=0; i<vec_len(new_flow_table); i++) + if (vip->new_flow_table == 0 || + new_flow_table[i].as_index != vip->new_flow_table[i].as_index) + count++; + + old_table = vip->new_flow_table; + vip->new_flow_table = new_flow_table; + vec_free(old_table); +} + +int lb_conf(ip4_address_t *ip4_address, ip6_address_t *ip6_address, + u32 per_cpu_sticky_buckets, u32 flow_timeout) +{ + lb_main_t *lbm = &lb_main; + + if (!is_pow2(per_cpu_sticky_buckets)) + return VNET_API_ERROR_INVALID_MEMORY_SIZE; + + lb_get_writer_lock(); //Not exactly necessary but just a reminder that it exists for my future self + lbm->ip4_src_address = *ip4_address; + lbm->ip6_src_address = *ip6_address; + lbm->per_cpu_sticky_buckets = per_cpu_sticky_buckets; + lbm->flow_timeout = flow_timeout; + lb_put_writer_lock(); + return 0; +} + +static +int lb_vip_find_index_with_lock(ip46_address_t *prefix, u8 plen, u32 *vip_index) +{ + lb_main_t *lbm = &lb_main; + lb_vip_t *vip; + ASSERT (lbm->writer_lock[0]); //This must be called with the lock owned + ip46_prefix_normalize(prefix, plen); + pool_foreach(vip, lbm->vips, { + if ((vip->flags & LB_AS_FLAGS_USED) && + vip->plen == plen && + vip->prefix.as_u64[0] == prefix->as_u64[0] && + vip->prefix.as_u64[1] == prefix->as_u64[1]) { + *vip_index = vip - lbm->vips; + return 0; + } + }); + return VNET_API_ERROR_NO_SUCH_ENTRY; +} + +int lb_vip_find_index(ip46_address_t *prefix, u8 plen, u32 *vip_index) +{ + int ret; + lb_get_writer_lock(); + ret = lb_vip_find_index_with_lock(prefix, plen, vip_index); + lb_put_writer_lock(); + return ret; +} + +static int lb_as_find_index_vip(lb_vip_t *vip, ip46_address_t *address, u32 *as_index) +{ + lb_main_t *lbm = &lb_main; + ASSERT (lbm->writer_lock[0]); //This must be called with the lock owned + lb_as_t *as; + u32 *asi; + pool_foreach(asi, vip->as_indexes, { + as = &lbm->ass[*asi]; + if (as->vip_index == (vip - lbm->vips) && + as->address.as_u64[0] == address->as_u64[0] && + as->address.as_u64[1] == address->as_u64[1]) { + *as_index = as - lbm->ass; + return 0; + } + }); + return -1; +} + +int lb_vip_add_ass(u32 vip_index, ip46_address_t *addresses, u32 n) +{ + lb_main_t *lbm = &lb_main; + lb_get_writer_lock(); + lb_vip_t *vip; + if (!(vip = lb_vip_get_by_index(vip_index))) { + lb_put_writer_lock(); + return VNET_API_ERROR_NO_SUCH_ENTRY; + } + + ip46_type_t type = lb_vip_is_gre4(vip)?IP46_TYPE_IP4:IP46_TYPE_IP6; + u32 *to_be_added = 0; + u32 *to_be_updated = 0; + u32 i; + u32 *ip; + + //Sanity check + while (n--) { + + if (!lb_as_find_index_vip(vip, &addresses[n], &i)) { + if (lbm->ass[i].flags & LB_AS_FLAGS_USED) { + vec_free(to_be_added); + vec_free(to_be_updated); + lb_put_writer_lock(); + return VNET_API_ERROR_VALUE_EXIST; + } + vec_add1(to_be_updated, i); + goto next; + } + + if (ip46_address_type(&addresses[n]) != type) { + 
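+      /* The AS address family does not match the VIP encap type:
+       * free the temporary vectors and reject the whole request. */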
vec_free(to_be_added); + vec_free(to_be_updated); + lb_put_writer_lock(); + return VNET_API_ERROR_INVALID_ADDRESS_FAMILY; + } + + if (n) { + u32 n2 = n; + while(n2--) //Check for duplicates + if (addresses[n2].as_u64[0] == addresses[n].as_u64[0] && + addresses[n2].as_u64[1] == addresses[n].as_u64[1]) + goto next; + } + + vec_add1(to_be_added, n); + +next: + continue; + } + + //Update reused ASs + vec_foreach(ip, to_be_updated) { + lbm->ass[*ip].flags = LB_AS_FLAGS_USED; + } + vec_free(to_be_updated); + + //Create those who have to be created + vec_foreach(ip, to_be_added) { + lb_as_t *as; + u32 *as_index; + pool_get(lbm->ass, as); + as->address = addresses[*ip]; + as->flags = LB_AS_FLAGS_USED; + as->vip_index = vip_index; + pool_get(vip->as_indexes, as_index); + *as_index = as - lbm->ass; + + /* + * become a child of the FIB entry + * so we are informed when its forwarding changes + */ + fib_prefix_t nh = {}; + if (lb_vip_is_gre4(vip)) { + nh.fp_addr.ip4 = as->address.ip4; + nh.fp_len = 32; + nh.fp_proto = FIB_PROTOCOL_IP4; + } else { + nh.fp_addr.ip6 = as->address.ip6; + nh.fp_len = 128; + nh.fp_proto = FIB_PROTOCOL_IP6; + } + + as->next_hop_fib_entry_index = + fib_table_entry_special_add(0, + &nh, + FIB_SOURCE_RR, + FIB_ENTRY_FLAG_NONE); + as->next_hop_child_index = + fib_entry_child_add(as->next_hop_fib_entry_index, + lbm->fib_node_type, + as - lbm->ass); + + lb_as_stack(as); + } + vec_free(to_be_added); + + //Recompute flows + lb_vip_update_new_flow_table(vip); + + //Garbage collection maybe + lb_vip_garbage_collection(vip); + + lb_put_writer_lock(); + return 0; +} + +int lb_vip_del_ass_withlock(u32 vip_index, ip46_address_t *addresses, u32 n) +{ + lb_main_t *lbm = &lb_main; + u32 now = (u32) vlib_time_now(vlib_get_main()); + u32 *ip = 0; + + lb_vip_t *vip; + if (!(vip = lb_vip_get_by_index(vip_index))) { + return VNET_API_ERROR_NO_SUCH_ENTRY; + } + + u32 *indexes = NULL; + while (n--) { + u32 i; + if (lb_as_find_index_vip(vip, &addresses[n], &i)) { + vec_free(indexes); + return VNET_API_ERROR_NO_SUCH_ENTRY; + } + + if (n) { //Check for duplicates + u32 n2 = n - 1; + while(n2--) { + if (addresses[n2].as_u64[0] == addresses[n].as_u64[0] && + addresses[n2].as_u64[1] == addresses[n].as_u64[1]) + goto next; + } + } + + vec_add1(indexes, i); +next: + continue; + } + + //Garbage collection maybe + lb_vip_garbage_collection(vip); + + if (indexes != NULL) { + vec_foreach(ip, indexes) { + lbm->ass[*ip].flags &= ~LB_AS_FLAGS_USED; + lbm->ass[*ip].last_used = now; + } + + //Recompute flows + lb_vip_update_new_flow_table(vip); + } + + vec_free(indexes); + return 0; +} + +int lb_vip_del_ass(u32 vip_index, ip46_address_t *addresses, u32 n) +{ + lb_get_writer_lock(); + int ret = lb_vip_del_ass_withlock(vip_index, addresses, n); + lb_put_writer_lock(); + return ret; +} + +/** + * Add the VIP adjacency to the ip4 or ip6 fib + */ +static void lb_vip_add_adjacency(lb_main_t *lbm, lb_vip_t *vip) +{ + dpo_proto_t proto = 0; + dpo_id_t dpo = DPO_INVALID; + fib_prefix_t pfx = {}; + if (lb_vip_is_ip4(vip)) { + pfx.fp_addr.ip4 = vip->prefix.ip4; + pfx.fp_len = vip->plen - 96; + pfx.fp_proto = FIB_PROTOCOL_IP4; + proto = DPO_PROTO_IP4; + } else { + pfx.fp_addr.ip6 = vip->prefix.ip6; + pfx.fp_len = vip->plen; + pfx.fp_proto = FIB_PROTOCOL_IP6; + proto = DPO_PROTO_IP6; + } + dpo_set(&dpo, lb_vip_is_gre4(vip)?lbm->dpo_gre4_type:lbm->dpo_gre6_type, + proto, vip - lbm->vips); + fib_table_entry_special_dpo_add(0, + &pfx, + FIB_SOURCE_PLUGIN_HI, + FIB_ENTRY_FLAG_EXCLUSIVE, + &dpo); + dpo_reset(&dpo); +} + +/** + * 
Deletes the adjacency associated with the VIP
+ */
+static void lb_vip_del_adjacency(lb_main_t *lbm, lb_vip_t *vip)
+{
+  fib_prefix_t pfx = {};
+  if (lb_vip_is_ip4(vip)) {
+    pfx.fp_addr.ip4 = vip->prefix.ip4;
+    pfx.fp_len = vip->plen - 96;
+    pfx.fp_proto = FIB_PROTOCOL_IP4;
+  } else {
+    pfx.fp_addr.ip6 = vip->prefix.ip6;
+    pfx.fp_len = vip->plen;
+    pfx.fp_proto = FIB_PROTOCOL_IP6;
+  }
+  fib_table_entry_special_remove(0, &pfx, FIB_SOURCE_PLUGIN_HI);
+}
+
+int lb_vip_add(ip46_address_t *prefix, u8 plen, lb_vip_type_t type, u32 new_length, u32 *vip_index)
+{
+  lb_main_t *lbm = &lb_main;
+  lb_vip_t *vip;
+  lb_get_writer_lock();
+  ip46_prefix_normalize(prefix, plen);
+
+  if (!lb_vip_find_index_with_lock(prefix, plen, vip_index)) {
+    lb_put_writer_lock();
+    return VNET_API_ERROR_VALUE_EXIST;
+  }
+
+  if (!is_pow2(new_length)) {
+    lb_put_writer_lock();
+    return VNET_API_ERROR_INVALID_MEMORY_SIZE;
+  }
+
+  if (ip46_prefix_is_ip4(prefix, plen) &&
+      (type != LB_VIP_TYPE_IP4_GRE4) &&
+      (type != LB_VIP_TYPE_IP4_GRE6)) {
+    //Do not return while still holding the writer lock
+    lb_put_writer_lock();
+    return VNET_API_ERROR_INVALID_ADDRESS_FAMILY;
+  }
+
+  //Allocate
+  pool_get(lbm->vips, vip);
+
+  //Init
+  vip->prefix = *prefix;
+  vip->plen = plen;
+  vip->last_garbage_collection = (u32) vlib_time_now(vlib_get_main());
+  vip->type = type;
+  vip->flags = LB_VIP_FLAGS_USED;
+  vip->as_indexes = 0;
+
+  //Validate counters
+  u32 i;
+  for (i = 0; i < LB_N_VIP_COUNTERS; i++) {
+    vlib_validate_simple_counter(&lbm->vip_counters[i], vip - lbm->vips);
+    vlib_zero_simple_counter(&lbm->vip_counters[i], vip - lbm->vips);
+  }
+
+  //Configure new flow table
+  vip->new_flow_table_mask = new_length - 1;
+  vip->new_flow_table = 0;
+
+  //Create a new flow hash table full of the default entry
+  lb_vip_update_new_flow_table(vip);
+
+  //Create adjacency to direct traffic
+  lb_vip_add_adjacency(lbm, vip);
+
+  //Return result
+  *vip_index = vip - lbm->vips;
+
+  lb_put_writer_lock();
+  return 0;
+}
+
+int lb_vip_del(u32 vip_index)
+{
+  lb_main_t *lbm = &lb_main;
+  lb_vip_t *vip;
+  lb_get_writer_lock();
+  if (!(vip = lb_vip_get_by_index(vip_index))) {
+    lb_put_writer_lock();
+    return VNET_API_ERROR_NO_SUCH_ENTRY;
+  }
+
+  //FIXME: This operation is actually not working
+  //We will need to remove state before performing this.
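+  //Current sequence: flush all ASs, delete the FIB adjacency, then clear
+  //LB_VIP_FLAGS_USED so the VIP no longer matches lookups.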
+ + { + //Remove all ASs + ip46_address_t *ass = 0; + lb_as_t *as; + u32 *as_index; + pool_foreach(as_index, vip->as_indexes, { + as = &lbm->ass[*as_index]; + vec_add1(ass, as->address); + }); + if (vec_len(ass)) + lb_vip_del_ass_withlock(vip_index, ass, vec_len(ass)); + vec_free(ass); + } + + //Delete adjacency + lb_vip_del_adjacency(lbm, vip); + + //Set the VIP as unused + vip->flags &= ~LB_VIP_FLAGS_USED; + + lb_put_writer_lock(); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_PLUGIN_REGISTER () = { + .version = VPP_BUILD_VER, + .description = "Load Balancer", +}; +/* *INDENT-ON* */ + +u8 *format_lb_dpo (u8 * s, va_list * va) +{ + index_t index = va_arg (*va, index_t); + CLIB_UNUSED(u32 indent) = va_arg (*va, u32); + lb_main_t *lbm = &lb_main; + lb_vip_t *vip = pool_elt_at_index (lbm->vips, index); + return format (s, "%U", format_lb_vip, vip); +} + +static void lb_dpo_lock (dpo_id_t *dpo) {} +static void lb_dpo_unlock (dpo_id_t *dpo) {} + +static fib_node_t * +lb_fib_node_get_node (fib_node_index_t index) +{ + lb_main_t *lbm = &lb_main; + lb_as_t *as = pool_elt_at_index (lbm->ass, index); + return (&as->fib_node); +} + +static void +lb_fib_node_last_lock_gone (fib_node_t *node) +{ +} + +static lb_as_t * +lb_as_from_fib_node (fib_node_t *node) +{ + return ((lb_as_t*)(((char*)node) - + STRUCT_OFFSET_OF(lb_as_t, fib_node))); +} + +static void +lb_as_stack (lb_as_t *as) +{ + lb_main_t *lbm = &lb_main; + lb_vip_t *vip = &lbm->vips[as->vip_index]; + dpo_stack(lb_vip_is_gre4(vip)?lbm->dpo_gre4_type:lbm->dpo_gre6_type, + lb_vip_is_ip4(vip)?DPO_PROTO_IP4:DPO_PROTO_IP6, + &as->dpo, + fib_entry_contribute_ip_forwarding( + as->next_hop_fib_entry_index)); +} + +static fib_node_back_walk_rc_t +lb_fib_node_back_walk_notify (fib_node_t *node, + fib_node_back_walk_ctx_t *ctx) +{ + lb_as_stack(lb_as_from_fib_node(node)); + return (FIB_NODE_BACK_WALK_CONTINUE); +} + +clib_error_t * +lb_init (vlib_main_t * vm) +{ + vlib_thread_main_t *tm = vlib_get_thread_main (); + lb_main_t *lbm = &lb_main; + lb_as_t *default_as; + fib_node_vft_t lb_fib_node_vft = { + .fnv_get = lb_fib_node_get_node, + .fnv_last_lock = lb_fib_node_last_lock_gone, + .fnv_back_walk = lb_fib_node_back_walk_notify, + }; + dpo_vft_t lb_vft = { + .dv_lock = lb_dpo_lock, + .dv_unlock = lb_dpo_unlock, + .dv_format = format_lb_dpo, + }; + + lbm->vips = 0; + lbm->per_cpu = 0; + vec_validate(lbm->per_cpu, tm->n_vlib_mains - 1); + lbm->writer_lock = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES, CLIB_CACHE_LINE_BYTES); + lbm->writer_lock[0] = 0; + lbm->per_cpu_sticky_buckets = LB_DEFAULT_PER_CPU_STICKY_BUCKETS; + lbm->flow_timeout = LB_DEFAULT_FLOW_TIMEOUT; + lbm->ip4_src_address.as_u32 = 0xffffffff; + lbm->ip6_src_address.as_u64[0] = 0xffffffffffffffffL; + lbm->ip6_src_address.as_u64[1] = 0xffffffffffffffffL; + lbm->dpo_gre4_type = dpo_register_new_type(&lb_vft, lb_dpo_gre4_nodes); + lbm->dpo_gre6_type = dpo_register_new_type(&lb_vft, lb_dpo_gre6_nodes); + lbm->fib_node_type = fib_node_register_new_type(&lb_fib_node_vft); + + //Init AS reference counters + vlib_refcount_init(&lbm->as_refcount); + + //Allocate and init default AS. 
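+  //ass[0] is a sentinel: new_flow_table entries point at it when a VIP has
+  //no AS configured, and its DPO next node (LB_NEXT_DROP) drops the packet.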
+  lbm->ass = 0;
+  pool_get(lbm->ass, default_as);
+  default_as->flags = 0;
+  default_as->dpo.dpoi_next_node = LB_NEXT_DROP;
+  default_as->vip_index = ~0;
+  default_as->address.ip6.as_u64[0] = 0xffffffffffffffffL;
+  default_as->address.ip6.as_u64[1] = 0xffffffffffffffffL;
+
+#define _(a,b,c) lbm->vip_counters[c].name = b;
+  lb_foreach_vip_counter
+#undef _
+  return NULL;
+}
+
+VLIB_INIT_FUNCTION (lb_init);
diff --git a/src/plugins/lb/lb.h b/src/plugins/lb/lb.h
new file mode 100644
index 00000000..882b9b30
--- /dev/null
+++ b/src/plugins/lb/lb.h
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * lb-plugin implements a MagLev-like load balancer.
+ * http://research.google.com/pubs/pub44824.html
+ *
+ * It has not been tested for interoperability with the original MagLev
+ * but intends to provide similar functionality.
+ * The load-balancer receives traffic destined to VIP (Virtual IP)
+ * addresses from one or multiple (ECMP) routers.
+ * The load-balancer tunnels the traffic toward many application servers,
+ * ensuring session stickiness (i.e. that a single session is tunneled
+ * towards a single application server).
+ *
+ */
+
+#ifndef LB_PLUGIN_LB_LB_H_
+#define LB_PLUGIN_LB_LB_H_
+
+#include <lb/util.h>
+#include <lb/refcount.h>
+
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/dpo/dpo.h>
+#include <vnet/fib/fib_table.h>
+
+#include <lb/lbhash.h>
+
+#define LB_DEFAULT_PER_CPU_STICKY_BUCKETS (1 << 10)
+#define LB_DEFAULT_FLOW_TIMEOUT 40
+
+typedef enum {
+  LB_NEXT_DROP,
+  LB_N_NEXT,
+} lb_next_t;
+
+/**
+ * Each VIP is configured with a set of
+ * application servers.
+ */
+typedef struct {
+  /**
+   * Registration to FIB event.
+   */
+  fib_node_t fib_node;
+
+  /**
+   * Destination address used to tunnel traffic towards
+   * that application server.
+   * The address is also used as ID and pseudo-random
+   * seed for the load-balancing process.
+   */
+  ip46_address_t address;
+
+  /**
+   * ASs are indexed by address and VIP index.
+   * This means there will be duplicates if the same server
+   * address is used for multiple VIPs.
+   */
+  u32 vip_index;
+
+  /**
+   * Some per-AS flags.
+   * For now only LB_AS_FLAGS_USED is defined.
+   */
+  u8 flags;
+
+#define LB_AS_FLAGS_USED 0x1
+
+  /**
+   * Rotating timestamp of when the LB_AS_FLAGS_USED flag was last set.
+   *
+   * AS removal is based on garbage collection and reference counting.
+   * When an AS is removed, there is a race between the configuration core
+   * and worker cores which may still add a reference while it should not
+   * be used. This timestamp is used to not remove the AS while a race
+   * condition may happen.
+   */
+  u32 last_used;
+
+  /**
+   * The FIB entry index for the next-hop
+   */
+  fib_node_index_t next_hop_fib_entry_index;
+
+  /**
+   * The child index on the FIB entry
+   */
+  u32 next_hop_child_index;
+
+  /**
+   * The next DPO in the graph to follow.
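+   * (Stacked via dpo_stack() on the AS next-hop FIB entry; see lb_as_stack().)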
+ */ + dpo_id_t dpo; + +} lb_as_t; + +format_function_t format_lb_as; + +typedef struct { + u32 as_index; +} lb_new_flow_entry_t; + +#define lb_foreach_vip_counter \ + _(NEXT_PACKET, "packet from existing sessions", 0) \ + _(FIRST_PACKET, "first session packet", 1) \ + _(UNTRACKED_PACKET, "untracked packet", 2) \ + _(NO_SERVER, "no server configured", 3) + +typedef enum { +#define _(a,b,c) LB_VIP_COUNTER_##a = c, + lb_foreach_vip_counter +#undef _ + LB_N_VIP_COUNTERS +} lb_vip_counter_t; + +/** + * The load balancer supports IPv4 and IPv6 traffic + * and GRE4 and GRE6 encap. + */ +typedef enum { + LB_VIP_TYPE_IP6_GRE6, + LB_VIP_TYPE_IP6_GRE4, + LB_VIP_TYPE_IP4_GRE6, + LB_VIP_TYPE_IP4_GRE4, + LB_VIP_N_TYPES, +} lb_vip_type_t; + +format_function_t format_lb_vip_type; +unformat_function_t unformat_lb_vip_type; + +/** + * Load balancing service is provided per VIP. + * In this data model, a VIP can be a whole prefix. + * But load balancing only + * occurs on a per-source-address/port basis. Meaning that if a given source + * reuses the same port for multiple destinations within the same VIP, + * they will be considered as a single flow. + */ +typedef struct { + + //Runtime + + /** + * Vector mapping (flow-hash & new_connect_table_mask) to AS index. + * This is used for new flows. + */ + lb_new_flow_entry_t *new_flow_table; + + /** + * New flows table length - 1 + * (length MUST be a power of 2) + */ + u32 new_flow_table_mask; + + /** + * Last time garbage collection was run to free the ASs. + */ + u32 last_garbage_collection; + + //Not runtime + + /** + * A Virtual IP represents a given service delivered + * by a set of application servers. It can be a single + * address or a prefix. + * IPv4 prefixes are encoded using IPv4-in-IPv6 embedded address + * (i.e. ::/96 prefix). + */ + ip46_address_t prefix; + + /** + * The VIP prefix length. + * In case of IPv4, plen = 96 + ip4_plen. + */ + u8 plen; + + /** + * The type of traffic for this. + * LB_TYPE_UNDEFINED if unknown. + */ + lb_vip_type_t type; + + /** + * Flags related to this VIP. + * LB_VIP_FLAGS_USED means the VIP is active. + * When it is not set, the VIP in the process of being removed. + * We cannot immediately remove a VIP because the VIP index still may be stored + * in the adjacency index. + */ + u8 flags; +#define LB_VIP_FLAGS_USED 0x1 + + /** + * Pool of AS indexes used for this VIP. + * This also includes ASs that have been removed (but are still referenced). + */ + u32 *as_indexes; +} lb_vip_t; + +#define lb_vip_is_ip4(vip) ((vip)->type == LB_VIP_TYPE_IP4_GRE6 || (vip)->type == LB_VIP_TYPE_IP4_GRE4) +#define lb_vip_is_gre4(vip) ((vip)->type == LB_VIP_TYPE_IP6_GRE4 || (vip)->type == LB_VIP_TYPE_IP4_GRE4) +format_function_t format_lb_vip; +format_function_t format_lb_vip_detailed; + +typedef struct { + /** + * Each CPU has its own sticky flow hash table. + * One single table is used for all VIPs. + */ + lb_hash_t *sticky_ht; +} lb_per_cpu_t; + +typedef struct { + /** + * Pool of all Virtual IPs + */ + lb_vip_t *vips; + + /** + * Pool of ASs. + * ASs are referenced by address and vip index. + * The first element (index 0) is special and used only to fill + * new_flow_tables when no AS has been configured. + */ + lb_as_t *ass; + + /** + * Each AS has an associated reference counter. + * As ass[0] has a special meaning, its associated counter + * starts at 0 and is decremented instead. i.e. do not use it. 
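+   * Reads (vlib_refcount_get) sum the per-thread values and are therefore
+   * only approximate while worker threads are running.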
+   */
+  vlib_refcount_t as_refcount;
+
+  /**
+   * Some global data is per-cpu
+   */
+  lb_per_cpu_t *per_cpu;
+
+  /**
+   * Node next index for IP adjacencies, for each of the traffic types.
+   */
+  u32 ip_lookup_next_index[LB_VIP_N_TYPES];
+
+  /**
+   * Source address used in IPv6 encapsulated traffic
+   */
+  ip6_address_t ip6_src_address;
+
+  /**
+   * Source address used for IPv4 encapsulated traffic
+   */
+  ip4_address_t ip4_src_address;
+
+  /**
+   * Number of buckets in the per-cpu sticky hash table.
+   */
+  u32 per_cpu_sticky_buckets;
+
+  /**
+   * Flow timeout in seconds.
+   */
+  u32 flow_timeout;
+
+  /**
+   * Per VIP counter
+   */
+  vlib_simple_counter_main_t vip_counters[LB_N_VIP_COUNTERS];
+
+  /**
+   * DPO used to send packet from IP4/6 lookup to LB node.
+   */
+  dpo_type_t dpo_gre4_type;
+  dpo_type_t dpo_gre6_type;
+
+  /**
+   * Node type for registering to fib changes.
+   */
+  fib_node_type_t fib_node_type;
+
+  /**
+   * API dynamically registered base ID.
+   */
+  u16 msg_id_base;
+
+  volatile u32 *writer_lock;
+} lb_main_t;
+
+extern lb_main_t lb_main;
+extern vlib_node_registration_t lb6_node;
+extern vlib_node_registration_t lb4_node;
+
+/**
+ * Set global load-balancer parameters.
+ * @param ip4_address IPv4 source address used for encapsulated traffic
+ * @param ip6_address IPv6 source address used for encapsulated traffic
+ * @return 0 on success, a VNET_API_ERROR_XXX code on error
+ */
+int lb_conf(ip4_address_t *ip4_address, ip6_address_t *ip6_address,
+            u32 sticky_buckets, u32 flow_timeout);
+
+int lb_vip_add(ip46_address_t *prefix, u8 plen, lb_vip_type_t type,
+               u32 new_length, u32 *vip_index);
+int lb_vip_del(u32 vip_index);
+
+int lb_vip_find_index(ip46_address_t *prefix, u8 plen, u32 *vip_index);
+
+#define lb_vip_get_by_index(index) (pool_is_free_index(lb_main.vips, index)?NULL:pool_elt_at_index(lb_main.vips, index))
+
+int lb_vip_add_ass(u32 vip_index, ip46_address_t *addresses, u32 n);
+int lb_vip_del_ass(u32 vip_index, ip46_address_t *addresses, u32 n);
+
+u32 lb_hash_time_now(vlib_main_t * vm);
+
+void lb_garbage_collection();
+
+format_function_t format_lb_main;
+
+#endif /* LB_PLUGIN_LB_LB_H_ */
diff --git a/src/plugins/lb/lb_plugin_doc.md b/src/plugins/lb/lb_plugin_doc.md
new file mode 100644
index 00000000..c7885ffb
--- /dev/null
+++ b/src/plugins/lb/lb_plugin_doc.md
@@ -0,0 +1,141 @@
+# Load Balancer plugin for VPP {#lb_plugin_doc}
+
+## Version
+
+The load balancer plugin is currently in *beta* version.
+Both CLIs and APIs are subject to *heavy* changes,
+which also means feedback is very welcome regarding features, APIs, etc.
+
+## Overview
+
+This plugin provides load balancing for VPP in a way that is largely inspired
+by Google's MagLev: http://research.google.com/pubs/pub44824.html
+
+The load balancer is configured with a set of Virtual IPs (VIPs, which can be
+prefixes), and, for each VIP, with a set of Application Server addresses (ASs).
+
+Traffic received for a given VIP (or VIP prefix) is tunneled using GRE towards
+the different ASs in a way that (tries to) ensure that a given session will
+always be tunneled to the same AS.
+
+Both VIPs and ASs can be IPv4 or IPv6, but for a given VIP, all ASs must use
+the same encapsulation type (i.e. IPv4+GRE or IPv6+GRE), meaning that all AS
+addresses of a given VIP must be of the same family.
+
+## Performance
+
+The load balancer has been tested with up to 1 million flows and still
+forwards more than 3 Mpps per core under those conditions.
+Although 3 Mpps is already good, performance is likely to improve in
+future versions.
+
+## Configuration
+
+### Global LB parameters
+
+The load balancer needs to be configured with some parameters:
+
+    lb conf [ip4-src-address <addr>] [ip6-src-address <addr>]
+            [buckets <n>] [timeout <s>]
+
+ip4-src-address: the source address used to send encapsulated packets over IPv4.
+
+ip6-src-address: the source address used to send encapsulated packets over IPv6.
+
+buckets: the *per-thread* number of buckets in the established-connections table.
+
+timeout: the number of seconds a connection will remain in the
+         established-connections table while no packet for this flow
+         is received.
+
+
+### Configure the VIPs
+
+    lb vip <prefix> [encap (gre6|gre4)] [new_len <n>] [del]
+
+new_len is the size of the new-connections table. It should be one or two
+orders of magnitude bigger than the number of ASs for the VIP in order to
+ensure good load balancing.
+
+Examples:
+
+    lb vip 2002::/16 encap gre6 new_len 1024
+    lb vip 2003::/16 encap gre4 new_len 2048
+    lb vip 80.0.0.0/8 encap gre6 new_len 16
+    lb vip 90.0.0.0/8 encap gre4 new_len 1024
+
+### Configure the ASs (for each VIP)
+
+    lb as <vip-prefix> [<address> [<address> [...]]] [del]
+
+You can add (or delete) any number of ASs at a time (for a single VIP).
+Note that the AS address family must match the VIP encapsulation IP family.
+
+Examples:
+
+    lb as 2002::/16 2001::2 2001::3 2001::4
+    lb as 2003::/16 10.0.0.1 10.0.0.2
+    lb as 80.0.0.0/8 2001::2
+    lb as 90.0.0.0/8 10.0.0.1
+
+
+
+## Monitoring
+
+The plugin provides a number of counters and other information.
+These are still subject to significant change.
+
+    show lb
+    show lb vip
+    show lb vip verbose
+
+    show node counters
+
+
+## Design notes
+
+### Multi-Threading
+
+MagLev is a distributed system which pseudo-randomly generates a
+new-connections table based on AS names, such that each server configured
+with the same set of ASs ends up with the same table. Connection stickiness
+is then ensured with an established-connections table. Using ECMP, it is
+assumed (but not relied on) that servers will mostly receive traffic for
+different flows.
+
+This implementation pushes the parallelism a little further by using one
+established-connections table per thread. This is equivalent to assuming
+that RSS will do a job similar to ECMP, and is quite useful as threads do
+not need to take a lock in order to write to the table.
+
+### Hash Table
+
+A load balancer requires a hash table that is efficient for both reading
+and writing. The hash table used by ip6-forward is very read-efficient but
+much less write-efficient. In addition, it is acceptable for a write to the
+hash table to fail (again, MagLev uses a flow table but does not rely
+heavily on it).
+
+The plugin therefore uses a deliberately simple hash table:
+ - Fixed (and power of 2) number of buckets (configured at runtime)
+ - Fixed (and power of 2) number of entries per bucket (configured at compile time)
+
+### Reference counting
+
+When an AS is removed, there are two possible ways to react:
+ - Keep using the AS for established connections
+ - Change the AS for established connections (likely to cause errors for TCP)
+
+In the first case, although an AS is removed from the configuration, its
+associated state needs to stay around as long as it is used by at least one
+thread.
+
+In order to avoid locks, a specific reference counter is used; a minimal
+sketch of the idea is shown below.
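+The following sketch is illustrative only: the names and the fixed thread
+count are hypothetical, and the actual implementation (see refcount.h)
+uses growable per-thread vectors.
+
+    /* One slot per thread, padded to a 64B cache line so threads do not
+     * false-share; assumes an 8-byte long. Only thread t writes slots[t]. */
+    #define N_THREADS 4  /* hypothetical fixed thread count */
+    typedef struct { long value; char pad[56]; } slot_t;
+    static slot_t slots[N_THREADS];
+
+    /* Writers: no lock, no atomics; each thread touches only its own slot. */
+    static void ref_add(int thread, long v) { slots[thread].value += v; }
+
+    /* Readers: sum all slots without locking; while writers are active the
+     * result is only approximate. */
+    static long ref_get(void) {
+      long sum = 0;
+      int t;
+      for (t = 0; t < N_THREADS; t++)
+        sum += slots[t].value;
+      return sum;
+    }
+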
The design is quite +similar to clib counters but: + - It is possible to decrease the value + - Summing will not zero the per-thread counters + - Only the thread can reallocate its own counters vector (to avoid concurrency issues) + +This reference counter is lock free, but reading a count of 0 does not mean +the value can be freed unless it is ensured by *other* means that no other thread +is concurrently referencing the object. In the case of this plugin, it is assumed +that no concurrent event will take place after a few seconds. + diff --git a/src/plugins/lb/lb_test.c b/src/plugins/lb/lb_test.c new file mode 100644 index 00000000..9b30c18d --- /dev/null +++ b/src/plugins/lb/lb_test.c @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vat/vat.h> +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vlibsocket/api.h> +#include <vppinfra/error.h> +#include <lb/lb.h> + +#define __plugin_msg_base lb_test_main.msg_id_base +#include <vlibapi/vat_helper_macros.h> + +//TODO: Move that to vat/plugin_api.c +////////////////////////// +uword unformat_ip46_address (unformat_input_t * input, va_list * args) +{ + ip46_address_t *ip46 = va_arg (*args, ip46_address_t *); + ip46_type_t type = va_arg (*args, ip46_type_t); + if ((type != IP46_TYPE_IP6) && + unformat(input, "%U", unformat_ip4_address, &ip46->ip4)) { + ip46_address_mask_ip4(ip46); + return 1; + } else if ((type != IP46_TYPE_IP4) && + unformat(input, "%U", unformat_ip6_address, &ip46->ip6)) { + return 1; + } + return 0; +} +uword unformat_ip46_prefix (unformat_input_t * input, va_list * args) +{ + ip46_address_t *ip46 = va_arg (*args, ip46_address_t *); + u8 *len = va_arg (*args, u8 *); + ip46_type_t type = va_arg (*args, ip46_type_t); + + u32 l; + if ((type != IP46_TYPE_IP6) && unformat(input, "%U/%u", unformat_ip4_address, &ip46->ip4, &l)) { + if (l > 32) + return 0; + *len = l + 96; + ip46->pad[0] = ip46->pad[1] = ip46->pad[2] = 0; + } else if ((type != IP46_TYPE_IP4) && unformat(input, "%U/%u", unformat_ip6_address, &ip46->ip6, &l)) { + if (l > 128) + return 0; + *len = l; + } else { + return 0; + } + return 1; +} +///////////////////////// + +#define vl_msg_id(n,h) n, +typedef enum { +#include <lb/lb.api.h> + /* We'll want to know how many messages IDs we need... */ + VL_MSG_FIRST_AVAILABLE, +} vl_msg_id_t; +#undef vl_msg_id + +/* define message structures */ +#define vl_typedefs +#include <lb/lb.api.h> +#undef vl_typedefs + +/* declare message handlers for each api */ + +#define vl_endianfun /* define message structures */ +#include <lb/lb.api.h> +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) +#define vl_printfun +#include <lb/lb.api.h> +#undef vl_printfun + +/* Get the API version number. 
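+   (The value below comes from lb.api.h, which is generated from lb/lb.api.)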
*/ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <lb/lb.api.h> +#undef vl_api_version + +typedef struct { + /* API message ID base */ + u16 msg_id_base; + vat_main_t *vat_main; +} lb_test_main_t; + +lb_test_main_t lb_test_main; + +#define foreach_standard_reply_retval_handler \ +_(lb_conf_reply) \ +_(lb_add_del_vip_reply) \ +_(lb_add_del_as_reply) + +#define _(n) \ + static void vl_api_##n##_t_handler \ + (vl_api_##n##_t * mp) \ + { \ + vat_main_t * vam = lb_test_main.vat_main; \ + i32 retval = ntohl(mp->retval); \ + if (vam->async_mode) { \ + vam->async_errors += (retval < 0); \ + } else { \ + vam->retval = retval; \ + vam->result_ready = 1; \ + } \ + } +foreach_standard_reply_retval_handler; +#undef _ + +/* + * Table of message reply handlers, must include boilerplate handlers + * we just generated + */ +#define foreach_vpe_api_reply_msg \ + _(LB_CONF_REPLY, lb_conf_reply) \ + _(LB_ADD_DEL_VIP_REPLY, lb_add_del_vip_reply) \ + _(LB_ADD_DEL_AS_REPLY, lb_add_del_as_reply) + +static int api_lb_conf (vat_main_t * vam) +{ + unformat_input_t *i = vam->input; + vl_api_lb_conf_t mps, *mp; + int ret; + + if (!unformat(i, "%U %U %u %u", + unformat_ip4_address, &mps.ip4_src_address, + unformat_ip6_address, mps.ip6_src_address, + &mps.sticky_buckets_per_core, + &mps.flow_timeout)) { + errmsg ("invalid arguments\n"); + return -99; + } + + M(LB_CONF, mp); + S(mp); + W (ret); + return ret; +} + +static int api_lb_add_del_vip (vat_main_t * vam) +{ + unformat_input_t * i = vam->input; + vl_api_lb_add_del_vip_t mps, *mp; + int ret; + mps.is_del = 0; + mps.is_gre4 = 0; + + if (!unformat(i, "%U", + unformat_ip46_prefix, mps.ip_prefix, &mps.prefix_length, IP46_TYPE_ANY)) { + errmsg ("invalid prefix\n"); + return -99; + } + + if (unformat(i, "gre4")) { + mps.is_gre4 = 1; + } else if (unformat(i, "gre6")) { + mps.is_gre4 = 0; + } else { + errmsg ("no encap\n"); + return -99; + } + + if (!unformat(i, "%d", &mps.new_flows_table_length)) { + errmsg ("no table lentgh\n"); + return -99; + } + + if (unformat(i, "del")) { + mps.is_del = 1; + } + + M(LB_ADD_DEL_VIP, mp); + S(mp); + W (ret); + return ret; +} + +static int api_lb_add_del_as (vat_main_t * vam) +{ + unformat_input_t * i = vam->input; + vl_api_lb_add_del_as_t mps, *mp; + int ret; + mps.is_del = 0; + + if (!unformat(i, "%U %U", + unformat_ip46_prefix, mps.vip_ip_prefix, &mps.vip_prefix_length, IP46_TYPE_ANY, + unformat_ip46_address, mps.as_address)) { + errmsg ("invalid prefix or address\n"); + return -99; + } + + if (unformat(i, "del")) { + mps.is_del = 1; + } + + M(LB_ADD_DEL_AS, mp); + S(mp); + W (ret); + return ret; +} + +/* + * List of messages that the api test plugin sends, + * and that the data plane plugin processes + */ +#define foreach_vpe_api_msg \ +_(lb_conf, "<ip4-src-addr> <ip6-src-address> <sticky_buckets_per_core> <flow_timeout>") \ +_(lb_add_del_vip, "<ip-prefix> [gre4|gre6] <new_table_len> [del]") \ +_(lb_add_del_as, "<vip-ip-prefix> <address> [del]") + +static void +lb_vat_api_hookup (vat_main_t *vam) +{ + lb_test_main_t * lbtm = &lb_test_main; + /* Hook up handlers for replies from the data plane plug-in */ +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + lbtm->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_vpe_api_reply_msg; +#undef _ + + /* API messages we can send */ +#define _(n,h) hash_set_mem (vam->function_by_name, #n, api_##n); + foreach_vpe_api_msg; +#undef _ + + /* Help strings */ 
+#define _(n,h) hash_set_mem (vam->help_by_name, #n, h); + foreach_vpe_api_msg; +#undef _ +} + +clib_error_t * vat_plugin_register (vat_main_t *vam) +{ + lb_test_main_t * lbtm = &lb_test_main; + + u8 * name; + + lbtm->vat_main = vam; + + /* Ask the vpp engine for the first assigned message-id */ + name = format (0, "lb_%08x%c", api_version, 0); + lbtm->msg_id_base = vl_client_get_first_plugin_msg_id ((char *) name); + + if (lbtm->msg_id_base != (u16) ~0) + lb_vat_api_hookup (vam); + + vec_free(name); + + return 0; +} diff --git a/src/plugins/lb/lbhash.h b/src/plugins/lb/lbhash.h new file mode 100644 index 00000000..c514fb57 --- /dev/null +++ b/src/plugins/lb/lbhash.h @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2012 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * vppinfra already includes tons of different hash tables. + * MagLev flow table is a bit different. It has to be very efficient + * for both writing and reading operations. But it does not need to + * be 100% reliable (write can fail). It also needs to recycle + * old entries in a lazy way. + * + * This hash table is the most dummy hash table you can do. + * Fixed total size, fixed bucket size. + * Advantage is that it could be very efficient (maybe). + * + */ + +#ifndef LB_PLUGIN_LB_LBHASH_H_ +#define LB_PLUGIN_LB_LBHASH_H_ + +#include <vnet/vnet.h> + +#if defined (__SSE4_2__) +#include <immintrin.h> +#endif + +/* + * @brief Number of entries per bucket. + */ +#define LBHASH_ENTRY_PER_BUCKET 4 + +#define LB_HASH_DO_NOT_USE_SSE_BUCKETS 0 + +/* + * @brief One bucket contains 4 entries. + * Each bucket takes one 64B cache line in memory. 
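+ * (4 entries x 4 u32 fields = 64 bytes, matching the cache-line
+ * alignment mark below.)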
+ */ +typedef struct { + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + u32 hash[LBHASH_ENTRY_PER_BUCKET]; + u32 timeout[LBHASH_ENTRY_PER_BUCKET]; + u32 vip[LBHASH_ENTRY_PER_BUCKET]; + u32 value[LBHASH_ENTRY_PER_BUCKET]; +} lb_hash_bucket_t; + +typedef struct { + u32 buckets_mask; + u32 timeout; + lb_hash_bucket_t buckets[]; +} lb_hash_t; + +#define lb_hash_nbuckets(h) (((h)->buckets_mask) + 1) +#define lb_hash_size(h) ((h)->buckets_mask + LBHASH_ENTRY_PER_BUCKET) + +#define lb_hash_foreach_bucket(h, bucket) \ + for (bucket = (h)->buckets; \ + bucket < (h)->buckets + lb_hash_nbuckets(h); \ + bucket++) + +#define lb_hash_foreach_entry(h, bucket, i) \ + lb_hash_foreach_bucket(h, bucket) \ + for (i = 0; i < LBHASH_ENTRY_PER_BUCKET; i++) + +#define lb_hash_foreach_valid_entry(h, bucket, i, now) \ + lb_hash_foreach_entry(h, bucket, i) \ + if (!clib_u32_loop_gt((now), bucket->timeout[i])) + +static_always_inline +lb_hash_t *lb_hash_alloc(u32 buckets, u32 timeout) +{ + if (!is_pow2(buckets)) + return NULL; + + // Allocate 1 more bucket for prefetch + u32 size = ((u64)&((lb_hash_t *)(0))->buckets[0]) + + sizeof(lb_hash_bucket_t) * (buckets + 1); + u8 *mem = 0; + lb_hash_t *h; + vec_alloc_aligned(mem, size, CLIB_CACHE_LINE_BYTES); + h = (lb_hash_t *)mem; + h->buckets_mask = (buckets - 1); + h->timeout = timeout; + return h; +} + +static_always_inline +void lb_hash_free(lb_hash_t *h) +{ + u8 *mem = (u8 *)h; + vec_free(mem); +} + +#if __SSE4_2__ && !defined (__i386__) +static_always_inline +u32 lb_hash_hash(u64 k0, u64 k1, u64 k2, u64 k3, u64 k4) +{ + u64 val = 0; + val = _mm_crc32_u64(val, k0); + val = _mm_crc32_u64(val, k1); + val = _mm_crc32_u64(val, k2); + val = _mm_crc32_u64(val, k3); + val = _mm_crc32_u64(val, k4); + return (u32) val; +} +#else +static_always_inline +u32 lb_hash_hash(u64 k0, u64 k1, u64 k2, u64 k3, u64 k4) +{ + u64 tmp = k0 ^ k1 ^ k2 ^ k3 ^ k4; + return (u32)clib_xxhash (tmp); +} +#endif + +static_always_inline +void lb_hash_prefetch_bucket(lb_hash_t *ht, u32 hash) +{ + lb_hash_bucket_t *bucket = &ht->buckets[hash & ht->buckets_mask]; + CLIB_PREFETCH(bucket, sizeof(*bucket), READ); +} + +static_always_inline +void lb_hash_get(lb_hash_t *ht, u32 hash, u32 vip, u32 time_now, + u32 *available_index, u32 *found_value) +{ + lb_hash_bucket_t *bucket = &ht->buckets[hash & ht->buckets_mask]; + *found_value = ~0; + *available_index = ~0; +#if __SSE4_2__ && LB_HASH_DO_NOT_USE_SSE_BUCKETS == 0 + u32 bitmask, found_index; + __m128i mask; + + // mask[*] = timeout[*] > now + mask = _mm_cmpgt_epi32(_mm_loadu_si128 ((__m128i *) bucket->timeout), + _mm_set1_epi32 (time_now)); + // bitmask[*] = now <= timeout[*/4] + bitmask = (~_mm_movemask_epi8(mask)) & 0xffff; + // Get first index with now <= timeout[*], if any. 
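+  // (_mm_movemask_epi8 yields one bit per byte, i.e. 4 bits per 32-bit
+  // entry, so the bit position is divided by 4 to get the entry index.)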
+ *available_index = (bitmask)?__builtin_ctz(bitmask)/4:*available_index; + + // mask[*] = (timeout[*] > now) && (hash[*] == hash) + mask = _mm_and_si128(mask, + _mm_cmpeq_epi32( + _mm_loadu_si128 ((__m128i *) bucket->hash), + _mm_set1_epi32 (hash))); + + // Load the array of vip values + // mask[*] = (timeout[*] > now) && (hash[*] == hash) && (vip[*] == vip) + mask = _mm_and_si128(mask, + _mm_cmpeq_epi32( + _mm_loadu_si128 ((__m128i *) bucket->vip), + _mm_set1_epi32 (vip))); + + // mask[*] = (timeout[*x4] > now) && (hash[*x4] == hash) && (vip[*x4] == vip) + bitmask = _mm_movemask_epi8(mask); + // Get first index, if any + found_index = (bitmask)?__builtin_ctzll(bitmask)/4:0; + ASSERT(found_index < 4); + *found_value = (bitmask)?bucket->value[found_index]:*found_value; + bucket->timeout[found_index] = + (bitmask)?time_now + ht->timeout:bucket->timeout[found_index]; +#else + u32 i; + for (i = 0; i < LBHASH_ENTRY_PER_BUCKET; i++) { + u8 cmp = (bucket->hash[i] == hash && bucket->vip[i] == vip); + u8 timeouted = clib_u32_loop_gt(time_now, bucket->timeout[i]); + *found_value = (cmp || timeouted)?*found_value:bucket->value[i]; + bucket->timeout[i] = (cmp || timeouted)?time_now + ht->timeout:bucket->timeout[i]; + *available_index = (timeouted && (*available_index == ~0))?i:*available_index; + + if (!cmp) + return; + } +#endif +} + +static_always_inline +u32 lb_hash_available_value(lb_hash_t *h, u32 hash, u32 available_index) +{ + return h->buckets[hash & h->buckets_mask].value[available_index]; +} + +static_always_inline +void lb_hash_put(lb_hash_t *h, u32 hash, u32 value, u32 vip, + u32 available_index, u32 time_now) +{ + lb_hash_bucket_t *bucket = &h->buckets[hash & h->buckets_mask]; + bucket->hash[available_index] = hash; + bucket->value[available_index] = value; + bucket->timeout[available_index] = time_now + h->timeout; + bucket->vip[available_index] = vip; +} + +static_always_inline +u32 lb_hash_elts(lb_hash_t *h, u32 time_now) +{ + u32 tot = 0; + lb_hash_bucket_t *bucket; + u32 i; + lb_hash_foreach_valid_entry(h, bucket, i, time_now) { + tot++; + } + return tot; +} + +#endif /* LB_PLUGIN_LB_LBHASH_H_ */ diff --git a/src/plugins/lb/node.c b/src/plugins/lb/node.c new file mode 100644 index 00000000..4a7485eb --- /dev/null +++ b/src/plugins/lb/node.c @@ -0,0 +1,421 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <lb/lb.h> + +#include <vnet/gre/packet.h> +#include <lb/lbhash.h> + +#define foreach_lb_error \ + _(NONE, "no error") \ + _(PROTO_NOT_SUPPORTED, "protocol not supported") + +typedef enum { +#define _(sym,str) LB_ERROR_##sym, + foreach_lb_error +#undef _ + LB_N_ERROR, +} lb_error_t; + +static char *lb_error_strings[] = { +#define _(sym,string) string, + foreach_lb_error +#undef _ +}; + +typedef struct { + u32 vip_index; + u32 as_index; +} lb_trace_t; + +u8 * +format_lb_trace (u8 * s, va_list * args) +{ + lb_main_t *lbm = &lb_main; + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + lb_trace_t *t = va_arg (*args, lb_trace_t *); + if (pool_is_free_index(lbm->vips, t->vip_index)) { + s = format(s, "lb vip[%d]: This VIP was freed since capture\n"); + } else { + s = format(s, "lb vip[%d]: %U\n", t->vip_index, format_lb_vip, &lbm->vips[t->vip_index]); + } + if (pool_is_free_index(lbm->ass, t->as_index)) { + s = format(s, "lb as[%d]: This AS was freed since capture\n"); + } else { + s = format(s, "lb as[%d]: %U\n", t->as_index, format_lb_as, &lbm->ass[t->as_index]); + } + return s; +} + +lb_hash_t *lb_get_sticky_table(u32 thread_index) +{ + lb_main_t *lbm = &lb_main; + lb_hash_t *sticky_ht = lbm->per_cpu[thread_index].sticky_ht; + //Check if size changed + if (PREDICT_FALSE(sticky_ht && (lbm->per_cpu_sticky_buckets != lb_hash_nbuckets(sticky_ht)))) + { + //Dereference everything in there + lb_hash_bucket_t *b; + u32 i; + lb_hash_foreach_entry(sticky_ht, b, i) { + vlib_refcount_add(&lbm->as_refcount, thread_index, b->value[i], -1); + vlib_refcount_add(&lbm->as_refcount, thread_index, 0, 1); + } + + lb_hash_free(sticky_ht); + sticky_ht = NULL; + } + + //Create if necessary + if (PREDICT_FALSE(sticky_ht == NULL)) { + lbm->per_cpu[thread_index].sticky_ht = lb_hash_alloc(lbm->per_cpu_sticky_buckets, lbm->flow_timeout); + sticky_ht = lbm->per_cpu[thread_index].sticky_ht; + clib_warning("Regenerated sticky table %p", sticky_ht); + } + + ASSERT(sticky_ht); + + //Update timeout + sticky_ht->timeout = lbm->flow_timeout; + return sticky_ht; +} + +u64 +lb_node_get_other_ports4(ip4_header_t *ip40) +{ + return 0; +} + +u64 +lb_node_get_other_ports6(ip6_header_t *ip60) +{ + return 0; +} + +static_always_inline u32 +lb_node_get_hash(vlib_buffer_t *p, u8 is_input_v4) +{ + u32 hash; + if (is_input_v4) + { + ip4_header_t *ip40; + u64 ports; + ip40 = vlib_buffer_get_current (p); + if (PREDICT_TRUE (ip40->protocol == IP_PROTOCOL_TCP || + ip40->protocol == IP_PROTOCOL_UDP)) + ports = ((u64)((udp_header_t *)(ip40 + 1))->src_port << 16) | + ((u64)((udp_header_t *)(ip40 + 1))->dst_port); + else + ports = lb_node_get_other_ports4(ip40); + + hash = lb_hash_hash(*((u64 *)&ip40->address_pair), ports, + 0, 0, 0); + } + else + { + ip6_header_t *ip60; + ip60 = vlib_buffer_get_current (p); + u64 ports; + if (PREDICT_TRUE (ip60->protocol == IP_PROTOCOL_TCP || + ip60->protocol == IP_PROTOCOL_UDP)) + ports = ((u64)((udp_header_t *)(ip60 + 1))->src_port << 16) | + ((u64)((udp_header_t *)(ip60 + 1))->dst_port); + else + ports = lb_node_get_other_ports6(ip60); + + hash = lb_hash_hash(ip60->src_address.as_u64[0], + ip60->src_address.as_u64[1], + ip60->dst_address.as_u64[0], + ip60->dst_address.as_u64[1], + ports); + } + return hash; +} + +static_always_inline uword +lb_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame, + u8 is_input_v4, //Compile-time parameter stating that is input is v4 (or v6) + u8 
is_encap_v4) //Compile-time parameter stating that is GRE encap is v4 (or v6) +{ + lb_main_t *lbm = &lb_main; + u32 n_left_from, *from, next_index, *to_next, n_left_to_next; + u32 thread_index = vlib_get_thread_index(); + u32 lb_time = lb_hash_time_now(vm); + + lb_hash_t *sticky_ht = lb_get_sticky_table(thread_index); + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + u32 nexthash0 = 0; + if (PREDICT_TRUE(n_left_from > 0)) + nexthash0 = lb_node_get_hash(vlib_get_buffer (vm, from[0]), is_input_v4); + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 pi0; + vlib_buffer_t *p0; + lb_vip_t *vip0; + u32 asindex0; + u16 len0; + u32 available_index0; + u8 counter = 0; + u32 hash0 = nexthash0; + + if (PREDICT_TRUE(n_left_from > 1)) + { + vlib_buffer_t *p1 = vlib_get_buffer (vm, from[1]); + //Compute next hash and prefetch bucket + nexthash0 = lb_node_get_hash(p1, is_input_v4); + lb_hash_prefetch_bucket(sticky_ht, nexthash0); + //Prefetch for encap, next + CLIB_PREFETCH (vlib_buffer_get_current(p1) - 64, 64, STORE); + } + + if (PREDICT_TRUE(n_left_from > 2)) + { + vlib_buffer_t *p2; + p2 = vlib_get_buffer(vm, from[2]); + /* prefetch packet header and data */ + vlib_prefetch_buffer_header(p2, STORE); + CLIB_PREFETCH (vlib_buffer_get_current(p2), 64, STORE); + } + + pi0 = to_next[0] = from[0]; + from += 1; + n_left_from -= 1; + to_next += 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, pi0); + vip0 = pool_elt_at_index (lbm->vips, + vnet_buffer (p0)->ip.adj_index[VLIB_TX]); + + if (is_input_v4) + { + ip4_header_t *ip40; + ip40 = vlib_buffer_get_current (p0); + len0 = clib_net_to_host_u16(ip40->length); + } + else + { + ip6_header_t *ip60; + ip60 = vlib_buffer_get_current (p0); + len0 = clib_net_to_host_u16(ip60->payload_length) + sizeof(ip6_header_t); + } + + lb_hash_get(sticky_ht, hash0, vnet_buffer (p0)->ip.adj_index[VLIB_TX], + lb_time, &available_index0, &asindex0); + + if (PREDICT_TRUE(asindex0 != ~0)) + { + //Found an existing entry + counter = LB_VIP_COUNTER_NEXT_PACKET; + } + else if (PREDICT_TRUE(available_index0 != ~0)) + { + //There is an available slot for a new flow + asindex0 = vip0->new_flow_table[hash0 & vip0->new_flow_table_mask].as_index; + counter = LB_VIP_COUNTER_FIRST_PACKET; + counter = (asindex0 == 0)?LB_VIP_COUNTER_NO_SERVER:counter; + + //TODO: There are race conditions with as0 and vip0 manipulation. + //Configuration may be changed, vectors resized, etc... + + //Dereference previously used + vlib_refcount_add(&lbm->as_refcount, thread_index, + lb_hash_available_value(sticky_ht, hash0, available_index0), -1); + vlib_refcount_add(&lbm->as_refcount, thread_index, + asindex0, 1); + + //Add sticky entry + //Note that when there is no AS configured, an entry is configured anyway. 
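+              //(the entry then points at the sentinel ass[0], whose DPO drops the packet)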
+ //But no configured AS is not something that should happen + lb_hash_put(sticky_ht, hash0, asindex0, + vnet_buffer (p0)->ip.adj_index[VLIB_TX], + available_index0, lb_time); + } + else + { + //Could not store new entry in the table + asindex0 = vip0->new_flow_table[hash0 & vip0->new_flow_table_mask].as_index; + counter = LB_VIP_COUNTER_UNTRACKED_PACKET; + } + + vlib_increment_simple_counter(&lbm->vip_counters[counter], + thread_index, + vnet_buffer (p0)->ip.adj_index[VLIB_TX], + 1); + + //Now let's encap + { + gre_header_t *gre0; + if (is_encap_v4) + { + ip4_header_t *ip40; + vlib_buffer_advance(p0, - sizeof(ip4_header_t) - sizeof(gre_header_t)); + ip40 = vlib_buffer_get_current(p0); + gre0 = (gre_header_t *)(ip40 + 1); + ip40->src_address = lbm->ip4_src_address; + ip40->dst_address = lbm->ass[asindex0].address.ip4; + ip40->ip_version_and_header_length = 0x45; + ip40->ttl = 128; + ip40->fragment_id = 0; + ip40->flags_and_fragment_offset = 0; + ip40->length = clib_host_to_net_u16(len0 + sizeof(gre_header_t) + sizeof(ip4_header_t)); + ip40->protocol = IP_PROTOCOL_GRE; + ip40->checksum = ip4_header_checksum (ip40); + } + else + { + ip6_header_t *ip60; + vlib_buffer_advance(p0, - sizeof(ip6_header_t) - sizeof(gre_header_t)); + ip60 = vlib_buffer_get_current(p0); + gre0 = (gre_header_t *)(ip60 + 1); + ip60->dst_address = lbm->ass[asindex0].address.ip6; + ip60->src_address = lbm->ip6_src_address; + ip60->hop_limit = 128; + ip60->ip_version_traffic_class_and_flow_label = clib_host_to_net_u32 (0x6<<28); + ip60->payload_length = clib_host_to_net_u16(len0 + sizeof(gre_header_t)); + ip60->protocol = IP_PROTOCOL_GRE; + } + + gre0->flags_and_version = 0; + gre0->protocol = (is_input_v4)? + clib_host_to_net_u16(0x0800): + clib_host_to_net_u16(0x86DD); + } + + if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED)) + { + lb_trace_t *tr = vlib_add_trace (vm, node, p0, sizeof (*tr)); + tr->as_index = asindex0; + tr->vip_index = vnet_buffer (p0)->ip.adj_index[VLIB_TX]; + } + + //Enqueue to next + //Note that this is going to error if asindex0 == 0 + vnet_buffer (p0)->ip.adj_index[VLIB_TX] = lbm->ass[asindex0].dpo.dpoi_index; + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, pi0, + lbm->ass[asindex0].dpo.dpoi_next_node); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return frame->n_vectors; +} + +static uword +lb6_gre6_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + return lb_node_fn(vm, node, frame, 0, 0); +} + +static uword +lb6_gre4_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + return lb_node_fn(vm, node, frame, 0, 1); +} + +static uword +lb4_gre6_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + return lb_node_fn(vm, node, frame, 1, 0); +} + +static uword +lb4_gre4_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + return lb_node_fn(vm, node, frame, 1, 1); +} + +VLIB_REGISTER_NODE (lb6_gre6_node) = +{ + .function = lb6_gre6_node_fn, + .name = "lb6-gre6", + .vector_size = sizeof (u32), + .format_trace = format_lb_trace, + + .n_errors = LB_N_ERROR, + .error_strings = lb_error_strings, + + .n_next_nodes = LB_N_NEXT, + .next_nodes = + { + [LB_NEXT_DROP] = "error-drop" + }, +}; + +VLIB_REGISTER_NODE (lb6_gre4_node) = +{ + .function = lb6_gre4_node_fn, + .name = "lb6-gre4", + .vector_size = sizeof (u32), + .format_trace = format_lb_trace, + + .n_errors = LB_N_ERROR, + .error_strings = lb_error_strings, + + 
.n_next_nodes = LB_N_NEXT, + .next_nodes = + { + [LB_NEXT_DROP] = "error-drop" + }, +}; + +VLIB_REGISTER_NODE (lb4_gre6_node) = +{ + .function = lb4_gre6_node_fn, + .name = "lb4-gre6", + .vector_size = sizeof (u32), + .format_trace = format_lb_trace, + + .n_errors = LB_N_ERROR, + .error_strings = lb_error_strings, + + .n_next_nodes = LB_N_NEXT, + .next_nodes = + { + [LB_NEXT_DROP] = "error-drop" + }, +}; + +VLIB_REGISTER_NODE (lb4_gre4_node) = +{ + .function = lb4_gre4_node_fn, + .name = "lb4-gre4", + .vector_size = sizeof (u32), + .format_trace = format_lb_trace, + + .n_errors = LB_N_ERROR, + .error_strings = lb_error_strings, + + .n_next_nodes = LB_N_NEXT, + .next_nodes = + { + [LB_NEXT_DROP] = "error-drop" + }, +}; + diff --git a/src/plugins/lb/refcount.c b/src/plugins/lb/refcount.c new file mode 100644 index 00000000..6f01ab5a --- /dev/null +++ b/src/plugins/lb/refcount.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <lb/refcount.h> + +void __vlib_refcount_resize(vlib_refcount_per_cpu_t *per_cpu, u32 size) +{ + u32 *new_counter = 0, *old_counter; + vec_validate(new_counter, size); + memcpy(new_counter, per_cpu->counters, per_cpu->length); + old_counter = per_cpu->counters; + per_cpu->counters = new_counter; + CLIB_MEMORY_BARRIER(); + per_cpu->length = vec_len(new_counter); + vec_free(old_counter); +} + +u64 vlib_refcount_get(vlib_refcount_t *r, u32 index) +{ + u64 count = 0; + vlib_thread_main_t *tm = vlib_get_thread_main (); + u32 thread_index; + for (thread_index = 0; thread_index < tm->n_vlib_mains; thread_index++) { + if (r->per_cpu[thread_index].length > index) + count += r->per_cpu[thread_index].counters[index]; + } + return count; +} + diff --git a/src/plugins/lb/refcount.h b/src/plugins/lb/refcount.h new file mode 100644 index 00000000..dcfcb3fe --- /dev/null +++ b/src/plugins/lb/refcount.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * vlib provides lock-free counters but those + * - Have 16bits per-CPU counter, which may overflow. + * - Would only increment. + * + * This is very similar to vlib counters, but may be used to count reference. + * Such a counter includes an arbitrary number of counters. Each counter + * is identified by its index. This is used to aggregate per-cpu memory. + * + * Warning: + * This reference counter is lock-free but is not race-condition free. 
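+ * Each thread must only update its own thread_index slot through
+ * vlib_refcount_add().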
+ * The counting result is approximate and another mechanism needs to be used + * in order to ensure that an object may be freed. + * + */ + +#include <vnet/vnet.h> + +typedef struct { + u32 *counters; + u32 length; + u32 *reader_lengths; + CLIB_CACHE_LINE_ALIGN_MARK(o); +} vlib_refcount_per_cpu_t; + +typedef struct { + vlib_refcount_per_cpu_t *per_cpu; +} vlib_refcount_t; + +void __vlib_refcount_resize(vlib_refcount_per_cpu_t *per_cpu, u32 size); + +static_always_inline +void vlib_refcount_add(vlib_refcount_t *r, u32 thread_index, u32 counter_index, i32 v) +{ + vlib_refcount_per_cpu_t *per_cpu = &r->per_cpu[thread_index]; + if (PREDICT_FALSE(counter_index >= per_cpu->length)) + __vlib_refcount_resize(per_cpu, clib_max(counter_index + 16, per_cpu->length * 2)); + + per_cpu->counters[counter_index] += v; +} + +u64 vlib_refcount_get(vlib_refcount_t *r, u32 index); + +static_always_inline +void vlib_refcount_init(vlib_refcount_t *r) +{ + vlib_thread_main_t *tm = vlib_get_thread_main (); + r->per_cpu = 0; + vec_validate (r->per_cpu, tm->n_vlib_mains - 1); +} + + diff --git a/src/plugins/lb/util.c b/src/plugins/lb/util.c new file mode 100644 index 00000000..d969d168 --- /dev/null +++ b/src/plugins/lb/util.c @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <lb/util.h> + +void ip46_prefix_normalize(ip46_address_t *prefix, u8 plen) +{ + if (plen == 0) { + prefix->as_u64[0] = 0; + prefix->as_u64[1] = 0; + } else if (plen <= 64) { + prefix->as_u64[0] &= clib_host_to_net_u64(0xffffffffffffffffL << (64 - plen)); + prefix->as_u64[1] = 0; + } else { + prefix->as_u64[1] &= clib_host_to_net_u64(0xffffffffffffffffL << (128 - plen)); + } + +} + +uword unformat_ip46_prefix (unformat_input_t * input, va_list * args) +{ + ip46_address_t *ip46 = va_arg (*args, ip46_address_t *); + u8 *len = va_arg (*args, u8 *); + ip46_type_t type = va_arg (*args, ip46_type_t); + + u32 l; + if ((type != IP46_TYPE_IP6) && unformat(input, "%U/%u", unformat_ip4_address, &ip46->ip4, &l)) { + if (l > 32) + return 0; + *len = l + 96; + ip46->pad[0] = ip46->pad[1] = ip46->pad[2] = 0; + } else if ((type != IP46_TYPE_IP4) && unformat(input, "%U/%u", unformat_ip6_address, &ip46->ip6, &l)) { + if (l > 128) + return 0; + *len = l; + } else { + return 0; + } + return 1; +} + +u8 *format_ip46_prefix (u8 * s, va_list * args) +{ + ip46_address_t *ip46 = va_arg (*args, ip46_address_t *); + u32 len = va_arg (*args, u32); //va_arg cannot use u8 or u16 + ip46_type_t type = va_arg (*args, ip46_type_t); + + int is_ip4 = 0; + if (type == IP46_TYPE_IP4) + is_ip4 = 1; + else if (type == IP46_TYPE_IP6) + is_ip4 = 0; + else + is_ip4 = (len >= 96) && ip46_address_is_ip4(ip46); + + return is_ip4 ? 
+ format(s, "%U/%d", format_ip4_address, &ip46->ip4, len - 96): + format(s, "%U/%d", format_ip6_address, &ip46->ip6, len); +} + diff --git a/src/plugins/lb/util.h b/src/plugins/lb/util.h new file mode 100644 index 00000000..3f082310 --- /dev/null +++ b/src/plugins/lb/util.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Non-LB specific stuff comes here + */ + +#ifndef LB_PLUGIN_LB_UTIL_H_ +#define LB_PLUGIN_LB_UTIL_H_ + +#include <vnet/vnet.h> +#include <vnet/ip/ip.h> + +#define ip46_address_type(ip46) (ip46_address_is_ip4(ip46)?IP46_TYPE_IP4:IP46_TYPE_IP6) +#define ip46_prefix_is_ip4(ip46, len) ((len) >= 96 && ip46_address_is_ip4(ip46)) +#define ip46_prefix_type(ip46, len) (ip46_prefix_is_ip4(ip46, len)?IP46_TYPE_IP4:IP46_TYPE_IP6) + +void ip46_prefix_normalize(ip46_address_t *prefix, u8 plen); +uword unformat_ip46_prefix (unformat_input_t * input, va_list * args); +u8 *format_ip46_prefix (u8 * s, va_list * args); + +/** + * 32 bits integer comparison for running values. + * 1 > 0 is true. But 1 > 0xffffffff also is. + */ +#define clib_u32_loop_gt(a, b) (((u32)(a)) - ((u32)(b)) < 0x7fffffff) + +#endif /* LB_PLUGIN_LB_UTIL_H_ */ diff --git a/src/plugins/memif.am b/src/plugins/memif.am new file mode 100644 index 00000000..15147e77 --- /dev/null +++ b/src/plugins/memif.am @@ -0,0 +1,37 @@ +# Copyright (c) 2017 Cisco Systems, Inc. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +vppplugins_LTLIBRARIES += memif_plugin.la +vppapitestplugins_LTLIBRARIES += memif_test_plugin.la + +memif_plugin_la_SOURCES = memif/memif.c \ + memif/memif_api.c \ + memif/cli.c \ + memif/node.c \ + memif/device.c \ + memif/socket.c \ + memif/memif_plugin.api.h + +memif_test_plugin_la_SOURCES = \ + memif/memif_test.c memif/memif_plugin.api.h + +noinst_HEADERS += memif/memif.h + +nobase_apiinclude_HEADERS += \ + memif/memif_all_api_h.h \ + memif/memif_msg_enum.h \ + memif/memif.api.h + +API_FILES += memif/memif.api + +# vi:syntax=automake diff --git a/src/plugins/memif/cli.c b/src/plugins/memif/cli.c new file mode 100644 index 00000000..e1bd0444 --- /dev/null +++ b/src/plugins/memif/cli.c @@ -0,0 +1,365 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+#include <stdint.h>
+#include <net/if.h>
+#include <sys/ioctl.h>
+#include <inttypes.h>
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vnet/ethernet/ethernet.h>
+
+#include <memif/memif.h>
+#include <memif/private.h>
+
+static clib_error_t *
+memif_create_command_fn (vlib_main_t * vm, unformat_input_t * input,
+			 vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  int r;
+  u32 ring_size = MEMIF_DEFAULT_RING_SIZE;
+  memif_create_if_args_t args = { 0 };
+  args.buffer_size = MEMIF_DEFAULT_BUFFER_SIZE;
+  u32 rx_queues = MEMIF_DEFAULT_RX_QUEUES;
+  u32 tx_queues = MEMIF_DEFAULT_TX_QUEUES;
+
+  /* Get a line of input. */
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (line_input, "id %u", &args.id))
+	;
+      else if (unformat (line_input, "socket %s", &args.socket_filename))
+	;
+      else if (unformat (line_input, "secret %s", &args.secret))
+	;
+      else if (unformat (line_input, "ring-size %u", &ring_size))
+	;
+      else if (unformat (line_input, "rx-queues %u", &rx_queues))
+	;
+      else if (unformat (line_input, "tx-queues %u", &tx_queues))
+	;
+      else if (unformat (line_input, "buffer-size %u", &args.buffer_size))
+	;
+      else if (unformat (line_input, "master"))
+	args.is_master = 1;
+      else if (unformat (line_input, "slave"))
+	args.is_master = 0;
+      else if (unformat (line_input, "mode ip"))
+	args.mode = MEMIF_INTERFACE_MODE_IP;
+      else if (unformat (line_input, "hw-addr %U",
+			 unformat_ethernet_address, args.hw_addr))
+	args.hw_addr_set = 1;
+      else
+	return clib_error_return (0, "unknown input `%U'",
+				  format_unformat_error, input);
+    }
+  unformat_free (line_input);
+
+  if (!is_pow2 (ring_size))
+    return clib_error_return (0, "ring size must be a power of 2");
+
+  args.log2_ring_size = min_log2 (ring_size);
+
+  if (rx_queues > 255 || rx_queues < 1)
+    return clib_error_return (0, "rx-queues must be between 1 and 255");
+  if (tx_queues > 255 || tx_queues < 1)
+    return clib_error_return (0, "tx-queues must be between 1 and 255");
+
+  args.rx_queues = rx_queues;
+  args.tx_queues = tx_queues;
+
+  r = memif_create_if (vm, &args);
+
+  vec_free (args.socket_filename);
+  vec_free (args.secret);
+
+  if (r <= VNET_API_ERROR_SYSCALL_ERROR_1
+      && r >= VNET_API_ERROR_SYSCALL_ERROR_10)
+    return clib_error_return (0, "%s (errno %d)", strerror (errno), errno);
+
+  if (r == VNET_API_ERROR_INVALID_INTERFACE)
+    return clib_error_return (0, "Invalid interface name");
+
+  if (r == VNET_API_ERROR_SUBIF_ALREADY_EXISTS)
+    return clib_error_return (0, "Interface with same id already exists");
+
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (memif_create_command, static) = {
+  .path = "create memif",
+  .short_help = "create memif [id <id>] [socket <path>] "
+                "[ring-size <size>] [buffer-size <size>] [hw-addr <mac-address>] "
+                "<master|slave> [rx-queues <number>] [tx-queues <number>] "
+                "[mode ip] [secret <string>]",
+  .function = memif_create_command_fn,
+};
+/* 
*INDENT-ON* */
+
+static clib_error_t *
+memif_delete_command_fn (vlib_main_t * vm, unformat_input_t * input,
+			 vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  u32 sw_if_index = ~0;
+  vnet_hw_interface_t *hw;
+  memif_main_t *mm = &memif_main;
+  memif_if_t *mif;
+  vnet_main_t *vnm = vnet_get_main ();
+
+  /* Get a line of input. */
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (line_input, "sw_if_index %d", &sw_if_index))
+	;
+      else if (unformat (line_input, "%U", unformat_vnet_sw_interface,
+			 vnm, &sw_if_index))
+	;
+      else
+	return clib_error_return (0, "unknown input `%U'",
+				  format_unformat_error, input);
+    }
+  unformat_free (line_input);
+
+  if (sw_if_index == ~0)
+    return clib_error_return (0,
+			      "please specify interface name or sw_if_index");
+
+  hw = vnet_get_sup_hw_interface (vnm, sw_if_index);
+  if (hw == NULL || memif_device_class.index != hw->dev_class_index)
+    return clib_error_return (0, "not a memif interface");
+
+  mif = pool_elt_at_index (mm->interfaces, hw->dev_instance);
+  memif_delete_if (vm, mif);
+
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (memif_delete_command, static) = {
+  .path = "delete memif",
+  .short_help = "delete memif {<interface> | sw_if_index <sw_idx>}",
+  .function = memif_delete_command_fn,
+};
+/* *INDENT-ON* */
+
+static u8 *
+format_memif_if_flags (u8 * s, va_list * args)
+{
+  u32 flags = va_arg (*args, u32);
+#define _(a,b,c) if (flags & (1 << a)) s = format (s, " %s", c);
+  foreach_memif_if_flag
+#undef _
+  return s;
+}
+
+static u8 *
+format_memif_if_mode (u8 * s, va_list * args)
+{
+  memif_if_t *mif = va_arg (*args, memif_if_t *);
+  if (mif->mode == MEMIF_INTERFACE_MODE_ETHERNET)
+    return format (s, "ethernet");
+  if (mif->mode == MEMIF_INTERFACE_MODE_IP)
+    return format (s, "ip");
+  if (mif->mode == MEMIF_INTERFACE_MODE_PUNT_INJECT)
+    return format (s, "punt-inject");
+  return format (s, "unknown mode (%u)", mif->mode);
+}
+
+static u8 *
+format_memif_queue (u8 * s, va_list * args)
+{
+  memif_if_t *mif = va_arg (*args, memif_if_t *);
+  memif_queue_t *mq = va_arg (*args, memif_queue_t *);
+  uword i = va_arg (*args, uword);
+  uword indent = format_get_indent (s);
+
+  s = format (s, "%U%s ring %u:\n",
+	      format_white_space, indent,
+	      (mif->flags & MEMIF_IF_FLAG_IS_SLAVE) ? 
+ "slave-to-master" : "master-to-slave", i); + s = format (s, "%Uregion %u offset %u ring-size %u int-fd %d\n", + format_white_space, indent + 4, + mq->region, mq->offset, (1 << mq->log2_ring_size), mq->int_fd); + + if (mq->ring) + s = format (s, "%Uhead %u tail %u flags 0x%04x interrupts %u\n", + format_white_space, indent + 4, + mq->ring->head, mq->ring->tail, mq->ring->flags, + mq->int_count); + + return s; +} + +static u8 * +format_memif_descriptor (u8 * s, va_list * args) +{ + memif_if_t *mif = va_arg (*args, memif_if_t *); + memif_queue_t *mq = va_arg (*args, memif_queue_t *); + uword indent = format_get_indent (s); + memif_ring_t *ring; + u16 ring_size; + u16 slot; + + ring_size = 1 << mq->log2_ring_size; + ring = mq->ring; + if (ring) + { + s = format (s, "%Udescriptor table:\n", format_white_space, indent); + s = + format (s, + "%Uid flags buf len desc len address offset user address\n", + format_white_space, indent); + s = + format (s, + "%U===== ===== ======= ======== ================== ====== ==================\n", + format_white_space, indent); + for (slot = 0; slot < ring_size; slot++) + { + s = format (s, "%U%-5d %-5d %-7d %-7d 0x%016lx %-6d 0x%016lx\n", + format_white_space, indent, slot, + ring->desc[slot].flags, ring->desc[slot].buffer_length, + ring->desc[slot].length, + mif->regions[ring->desc[slot].region].shm, + ring->desc[slot].offset, memif_get_buffer (mif, ring, + slot)); + } + s = format (s, "\n"); + } + + return s; +} + +static clib_error_t * +memif_show_command_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + memif_main_t *mm = &memif_main; + memif_if_t *mif; + vnet_main_t *vnm = vnet_get_main (); + memif_queue_t *mq; + uword i; + int show_descr = 0; + clib_error_t *error = 0; + u32 hw_if_index, *hw_if_indices = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat + (input, "%U", unformat_vnet_hw_interface, vnm, &hw_if_index)) + vec_add1 (hw_if_indices, hw_if_index); + else if (unformat (input, "descriptors")) + show_descr = 1; + else + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + goto done; + } + } + + if (vec_len (hw_if_indices) == 0) + { + /* *INDENT-OFF* */ + pool_foreach (mif, mm->interfaces, + vec_add1 (hw_if_indices, mif->hw_if_index); + ); + /* *INDENT-ON* */ + } + + for (hw_if_index = 0; hw_if_index < vec_len (hw_if_indices); hw_if_index++) + { + vnet_hw_interface_t *hi = + vnet_get_hw_interface (vnm, hw_if_indices[hw_if_index]); + mif = pool_elt_at_index (mm->interfaces, hi->dev_instance); + memif_socket_file_t *msf = vec_elt_at_index (mm->socket_files, + mif->socket_file_index); + vlib_cli_output (vm, "interface %U", format_vnet_sw_if_index_name, + vnm, mif->sw_if_index); + if (mif->remote_name) + vlib_cli_output (vm, " remote-name \"%s\"", mif->remote_name); + if (mif->remote_if_name) + vlib_cli_output (vm, " remote-interface \"%s\"", + mif->remote_if_name); + vlib_cli_output (vm, " id %d mode %U file %s", mif->id, + format_memif_if_mode, mif, msf->filename); + vlib_cli_output (vm, " flags%U", format_memif_if_flags, mif->flags); + vlib_cli_output (vm, " listener-fd %d conn-fd %d", msf->fd, + mif->conn_fd); + vlib_cli_output (vm, + " num-s2m-rings %u num-m2s-rings %u buffer-size %u", + mif->run.num_s2m_rings, mif->run.num_m2s_rings, + mif->run.buffer_size); + + if (mif->local_disc_string) + vlib_cli_output (vm, " local-disc-reason \"%s\"", + mif->local_disc_string); + if (mif->remote_disc_string) + vlib_cli_output (vm, " remote-disc-reason 
\"%s\"", + mif->remote_disc_string); + + vec_foreach_index (i, mif->tx_queues) + { + mq = vec_elt_at_index (mif->tx_queues, i); + vlib_cli_output (vm, " %U", format_memif_queue, mif, mq, i); + if (show_descr) + vlib_cli_output (vm, " %U", format_memif_descriptor, mif, mq); + } + vec_foreach_index (i, mif->rx_queues) + { + mq = vec_elt_at_index (mif->rx_queues, i); + vlib_cli_output (vm, " %U", format_memif_queue, mif, mq, i); + if (show_descr) + vlib_cli_output (vm, " %U", format_memif_descriptor, mif, mq); + } + } +done: + vec_free (hw_if_indices); + return error; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (memif_show_command, static) = { + .path = "show memif", + .short_help = "show memif {<interface>] [descriptors]", + .function = memif_show_command_fn, +}; +/* *INDENT-ON* */ + +clib_error_t * +memif_cli_init (vlib_main_t * vm) +{ + return 0; +} + +VLIB_INIT_FUNCTION (memif_cli_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/memif/device.c b/src/plugins/memif/device.c new file mode 100644 index 00000000..aff18f2d --- /dev/null +++ b/src/plugins/memif/device.c @@ -0,0 +1,380 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *------------------------------------------------------------------ + */ + +#define _GNU_SOURCE +#include <stdint.h> +#include <net/if.h> +#include <sys/ioctl.h> +#include <sys/uio.h> + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <vnet/ethernet/ethernet.h> + +#include <memif/memif.h> +#include <memif/private.h> + +#define foreach_memif_tx_func_error \ +_(NO_FREE_SLOTS, "no free tx slots") \ +_(TRUNC_PACKET, "packet > buffer size -- truncated in tx ring") \ +_(PENDING_MSGS, "pending msgs in tx ring") \ +_(NO_TX_QUEUES, "no tx queues") + +typedef enum +{ +#define _(f,s) MEMIF_TX_ERROR_##f, + foreach_memif_tx_func_error +#undef _ + MEMIF_TX_N_ERROR, +} memif_tx_func_error_t; + +static char *memif_tx_func_error_strings[] = { +#define _(n,s) s, + foreach_memif_tx_func_error +#undef _ +}; + +u8 * +format_memif_device_name (u8 * s, va_list * args) +{ + u32 dev_instance = va_arg (*args, u32); + memif_main_t *mm = &memif_main; + memif_if_t *mif = pool_elt_at_index (mm->interfaces, dev_instance); + + s = format (s, "memif%lu/%lu", mif->socket_file_index, mif->id); + return s; +} + +static u8 * +format_memif_device (u8 * s, va_list * args) +{ + u32 dev_instance = va_arg (*args, u32); + int verbose = va_arg (*args, int); + uword indent = format_get_indent (s); + + s = format (s, "MEMIF interface"); + if (verbose) + { + s = format (s, "\n%U instance %u", format_white_space, indent + 2, + dev_instance); + } + return s; +} + +static u8 * +format_memif_tx_trace (u8 * s, va_list * args) +{ + s = format (s, "Unimplemented..."); + return s; +} + +static_always_inline void +memif_prefetch_buffer_and_data (vlib_main_t * vm, u32 bi) +{ + vlib_buffer_t *b = vlib_get_buffer (vm, bi); + vlib_prefetch_buffer_header (b, LOAD); + CLIB_PREFETCH (b->data, CLIB_CACHE_LINE_BYTES, LOAD); +} + +/** + * @brief Copy buffer to tx ring + * + * @param * vm (in) + * @param * node (in) + * @param * mif (in) pointer to memif interface + * @param bi (in) vlib buffer index + * @param * ring (in) pointer to memif ring + * @param * head (in/out) ring head + * @param mask (in) ring size - 1 + */ +static_always_inline void +memif_copy_buffer_to_tx_ring (vlib_main_t * vm, vlib_node_runtime_t * node, + memif_if_t * mif, u32 bi, memif_ring_t * ring, + u16 * head, u16 mask) +{ + vlib_buffer_t *b0; + void *mb0; + u32 total = 0, len; + + mb0 = memif_get_buffer (mif, ring, *head); + ring->desc[*head].flags = 0; + do + { + b0 = vlib_get_buffer (vm, bi); + len = b0->current_length; + if (PREDICT_FALSE (ring->desc[*head].buffer_length < (total + len))) + { + if (PREDICT_TRUE (total)) + { + ring->desc[*head].length = total; + total = 0; + ring->desc[*head].flags |= MEMIF_DESC_FLAG_NEXT; + *head = (*head + 1) & mask; + mb0 = memif_get_buffer (mif, ring, *head); + ring->desc[*head].flags = 0; + } + } + if (PREDICT_TRUE (ring->desc[*head].buffer_length >= (total + len))) + { + clib_memcpy (mb0 + total, vlib_buffer_get_current (b0), + CLIB_CACHE_LINE_BYTES); + if (len > CLIB_CACHE_LINE_BYTES) + clib_memcpy (mb0 + CLIB_CACHE_LINE_BYTES + total, + vlib_buffer_get_current (b0) + CLIB_CACHE_LINE_BYTES, + len - CLIB_CACHE_LINE_BYTES); + total += len; + } + else + { + vlib_error_count (vm, node->node_index, MEMIF_TX_ERROR_TRUNC_PACKET, + 1); + break; + } + } + while ((bi = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) ? 
b0->next_buffer : 0)); + + if (PREDICT_TRUE (total)) + { + ring->desc[*head].length = total; + *head = (*head + 1) & mask; + } +} + +static_always_inline uword +memif_interface_tx_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame, memif_if_t * mif, + memif_ring_type_t type) +{ + u8 qid; + memif_ring_t *ring; + u32 *buffers = vlib_frame_args (frame); + u32 n_left = frame->n_vectors; + u16 ring_size, mask; + u16 head, tail; + u16 free_slots; + u32 thread_index = vlib_get_thread_index (); + u8 tx_queues = vec_len (mif->tx_queues); + memif_queue_t *mq; + + if (PREDICT_FALSE (tx_queues == 0)) + { + vlib_error_count (vm, node->node_index, MEMIF_TX_ERROR_NO_TX_QUEUES, + n_left); + goto error; + } + + if (tx_queues < vec_len (vlib_mains)) + { + qid = thread_index % tx_queues; + clib_spinlock_lock_if_init (&mif->lockp); + } + else + { + qid = thread_index; + } + mq = vec_elt_at_index (mif->tx_queues, qid); + ring = mq->ring; + ring_size = 1 << mq->log2_ring_size; + mask = ring_size - 1; + + /* free consumed buffers */ + + head = ring->head; + tail = ring->tail; + + if (tail > head) + free_slots = tail - head; + else + free_slots = ring_size - head + tail; + + while (n_left > 5 && free_slots > 1) + { + if (PREDICT_TRUE (head + 5 < ring_size)) + { + CLIB_PREFETCH (memif_get_buffer (mif, ring, head + 2), + CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (memif_get_buffer (mif, ring, head + 3), + CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (&ring->desc[head + 4], CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (&ring->desc[head + 5], CLIB_CACHE_LINE_BYTES, STORE); + } + else + { + CLIB_PREFETCH (memif_get_buffer (mif, ring, (head + 2) % mask), + CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (memif_get_buffer (mif, ring, (head + 3) % mask), + CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (&ring->desc[(head + 4) % mask], + CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (&ring->desc[(head + 5) % mask], + CLIB_CACHE_LINE_BYTES, STORE); + } + + memif_prefetch_buffer_and_data (vm, buffers[2]); + memif_prefetch_buffer_and_data (vm, buffers[3]); + + memif_copy_buffer_to_tx_ring (vm, node, mif, buffers[0], ring, &head, + mask); + memif_copy_buffer_to_tx_ring (vm, node, mif, buffers[1], ring, &head, + mask); + + buffers += 2; + n_left -= 2; + free_slots -= 2; + } + + while (n_left && free_slots) + { + memif_copy_buffer_to_tx_ring (vm, node, mif, buffers[0], ring, &head, + mask); + buffers++; + n_left--; + free_slots--; + } + + CLIB_MEMORY_STORE_BARRIER (); + ring->head = head; + + clib_spinlock_unlock_if_init (&mif->lockp); + + if (n_left) + { + vlib_error_count (vm, node->node_index, MEMIF_TX_ERROR_NO_FREE_SLOTS, + n_left); + } + + if ((ring->flags & MEMIF_RING_FLAG_MASK_INT) == 0 && mq->int_fd > -1) + { + u64 b = 1; + CLIB_UNUSED (int r) = write (mq->int_fd, &b, sizeof (b)); + mq->int_count++; + } + +error: + vlib_buffer_free (vm, vlib_frame_args (frame), frame->n_vectors); + + return frame->n_vectors; +} + +static uword +memif_interface_tx (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + memif_main_t *nm = &memif_main; + vnet_interface_output_runtime_t *rund = (void *) node->runtime_data; + memif_if_t *mif = pool_elt_at_index (nm->interfaces, rund->dev_instance); + + if (mif->flags & MEMIF_IF_FLAG_IS_SLAVE) + return memif_interface_tx_inline (vm, node, frame, mif, MEMIF_RING_S2M); + else + return memif_interface_tx_inline (vm, node, frame, mif, MEMIF_RING_M2S); +} + +static void +memif_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index, + u32 
node_index) +{ + memif_main_t *apm = &memif_main; + vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); + memif_if_t *mif = pool_elt_at_index (apm->interfaces, hw->dev_instance); + + /* Shut off redirection */ + if (node_index == ~0) + { + mif->per_interface_next_index = node_index; + return; + } + + mif->per_interface_next_index = + vlib_node_add_next (vlib_get_main (), memif_input_node.index, node_index); +} + +static void +memif_clear_hw_interface_counters (u32 instance) +{ + /* Nothing for now */ +} + +static clib_error_t * +memif_interface_rx_mode_change (vnet_main_t * vnm, u32 hw_if_index, u32 qid, + vnet_hw_interface_rx_mode mode) +{ + memif_main_t *mm = &memif_main; + vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); + memif_if_t *mif = pool_elt_at_index (mm->interfaces, hw->dev_instance); + memif_queue_t *mq = vec_elt_at_index (mif->rx_queues, qid); + + if (mode == VNET_HW_INTERFACE_RX_MODE_POLLING) + mq->ring->flags |= MEMIF_RING_FLAG_MASK_INT; + else + mq->ring->flags &= ~MEMIF_RING_FLAG_MASK_INT; + + return 0; +} + +static clib_error_t * +memif_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags) +{ + memif_main_t *mm = &memif_main; + vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); + memif_if_t *mif = pool_elt_at_index (mm->interfaces, hw->dev_instance); + static clib_error_t *error = 0; + + if (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) + mif->flags |= MEMIF_IF_FLAG_ADMIN_UP; + else + mif->flags &= ~MEMIF_IF_FLAG_ADMIN_UP; + + return error; +} + +static clib_error_t * +memif_subif_add_del_function (vnet_main_t * vnm, + u32 hw_if_index, + struct vnet_sw_interface_t *st, int is_add) +{ + /* Nothing for now */ + return 0; +} + +/* *INDENT-OFF* */ +VNET_DEVICE_CLASS (memif_device_class) = { + .name = "memif", + .tx_function = memif_interface_tx, + .format_device_name = format_memif_device_name, + .format_device = format_memif_device, + .format_tx_trace = format_memif_tx_trace, + .tx_function_n_errors = MEMIF_TX_N_ERROR, + .tx_function_error_strings = memif_tx_func_error_strings, + .rx_redirect_to_node = memif_set_interface_next_node, + .clear_counters = memif_clear_hw_interface_counters, + .admin_up_down_function = memif_interface_admin_up_down, + .subif_add_del_function = memif_subif_add_del_function, + .rx_mode_change_function = memif_interface_rx_mode_change, +}; + +VLIB_DEVICE_TX_FUNCTION_MULTIARCH(memif_device_class, + memif_interface_tx) +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/memif/memif.api b/src/plugins/memif/memif.api new file mode 100644 index 00000000..c9632d10 --- /dev/null +++ b/src/plugins/memif/memif.api @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+/** \brief Create memory interface
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param role - role of the interface in the connection (master/slave)
+    @param mode - interface mode
+    @param rx_queues - number of rx queues (only valid for slave)
+    @param tx_queues - number of tx queues (only valid for slave)
+    @param id - 32-bit integer used to authenticate and match opposite sides
+           of the connection
+    @param socket_filename - filename of the socket to be used for connection
+           establishment
+    @param ring_size - the number of entries of RX/TX rings
+    @param buffer_size - size of the buffer allocated for each ring entry
+    @param hw_addr - interface MAC address
+*/
+define memif_create
+{
+  u32 client_index;
+  u32 context;
+
+  u8 role; /* 0 = master, 1 = slave */
+  u8 mode; /* 0 = ethernet, 1 = ip, 2 = punt/inject */
+  u8 rx_queues; /* optional, default is 1 */
+  u8 tx_queues; /* optional, default is 1 */
+  u32 id; /* optional, default is 0 */
+  u8 socket_filename[128]; /* optional, default is "/var/vpp/memif.sock" */
+  u8 secret[24]; /* optional, default is "" */
+  u32 ring_size; /* optional, default is 1024 entries, must be power of 2 */
+  u16 buffer_size; /* optional, default is 2048 bytes */
+  u8 hw_addr[6]; /* optional, randomly generated if not defined */
+};
+
+/** \brief Create memory interface response
+    @param context - sender context, to match reply w/ request
+    @param retval - return value for request
+    @param sw_if_index - software index of the newly created interface
+*/
+define memif_create_reply
+{
+  u32 context;
+  i32 retval;
+  u32 sw_if_index;
+};
+
+/** \brief Delete memory interface
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param sw_if_index - software index of the interface to delete
+*/
+autoreply define memif_delete
+{
+  u32 client_index;
+  u32 context;
+
+  u32 sw_if_index;
+};
+
+/** \brief Memory interface details structure
+    @param context - sender context, to match reply w/ request (memif_dump)
+    @param sw_if_index - index of the interface
+    @param if_name - name of the interface
+    @param hw_addr - interface MAC address
+    @param id - id associated with the interface
+    @param role - role of the interface in the connection (master/slave)
+    @param mode - interface mode
+    @param socket_filename - name of the socket used by this interface
+           to establish new connections
+    @param ring_size - the number of entries of RX/TX rings
+    @param buffer_size - size of the buffer allocated for each ring entry
+    @param admin_up_down - interface administrative status
+    @param link_up_down - interface link status
+
+*/
+define memif_details
+{
+  u32 context;
+
+  u32 sw_if_index;
+  u8 if_name[64];
+  u8 hw_addr[6];
+
+  /* memif specific parameters */
+  u32 id;
+  u8 role; /* 0 = master, 1 = slave */
+  u8 mode; /* 0 = ethernet, 1 = ip, 2 = punt/inject */
+  u8 socket_filename[128];
+  u32 ring_size;
+  u16 buffer_size; /* optional, default is 2048 bytes */
+
+  /* 1 = up, 0 = down */
+  u8 admin_up_down;
+  u8 link_up_down;
+};
+
+/** \brief Dump all memory interfaces
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+*/
+define memif_dump
+{
+  u32 client_index;
+  u32 context;
+};
+
+/*
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/memif/memif.c b/src/plugins/memif/memif.c
new file mode 100644
index 00000000..a3be49fa 
--- /dev/null +++ b/src/plugins/memif/memif.c @@ -0,0 +1,819 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + + +#define _GNU_SOURCE +#include <stdint.h> +#include <net/if.h> +#include <sys/types.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <sys/uio.h> +#include <sys/mman.h> +#include <sys/prctl.h> +#include <sys/eventfd.h> +#include <inttypes.h> +#include <limits.h> + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <vppinfra/linux/syscall.h> +#include <vnet/plugin/plugin.h> +#include <vnet/ethernet/ethernet.h> +#include <vpp/app/version.h> +#include <memif/memif.h> +#include <memif/private.h> + +memif_main_t memif_main; + +static u32 +memif_eth_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, u32 flags) +{ + /* nothing for now */ + return 0; +} + +static void +memif_queue_intfd_close (memif_queue_t * mq) +{ + if (mq->int_clib_file_index != ~0) + { + memif_file_del_by_index (mq->int_clib_file_index); + mq->int_clib_file_index = ~0; + mq->int_fd = -1; + } + else if (mq->int_fd > -1) + { + close (mq->int_fd); + mq->int_fd = -1; + } +} + +void +memif_disconnect (memif_if_t * mif, clib_error_t * err) +{ + memif_main_t *mm = &memif_main; + vnet_main_t *vnm = vnet_get_main (); + memif_region_t *mr; + memif_queue_t *mq; + int i; + + if (mif == 0) + return; + + DBG ("disconnect %u (%v)", mif->dev_instance, err ? 
err->what : 0);
+
+  if (err)
+    {
+      clib_error_t *e = 0;
+      mif->local_disc_string = vec_dup (err->what);
+      if (mif->conn_fd > -1)
+	e = memif_msg_send_disconnect (mif, err);
+      clib_error_free (e);
+    }
+
+  /* set interface down */
+  mif->flags &= ~(MEMIF_IF_FLAG_CONNECTED | MEMIF_IF_FLAG_CONNECTING);
+  if (mif->hw_if_index != ~0)
+    vnet_hw_interface_set_flags (vnm, mif->hw_if_index, 0);
+
+  /* close connection socket */
+  if (mif->conn_clib_file_index != ~0)
+    {
+      memif_socket_file_t *msf = vec_elt_at_index (mm->socket_files,
+						   mif->socket_file_index);
+      hash_unset (msf->dev_instance_by_fd, mif->conn_fd);
+      memif_file_del_by_index (mif->conn_clib_file_index);
+      mif->conn_clib_file_index = ~0;
+    }
+  else if (mif->conn_fd > -1)
+    close (mif->conn_fd);
+  mif->conn_fd = -1;
+
+  vec_foreach_index (i, mif->rx_queues)
+  {
+    mq = vec_elt_at_index (mif->rx_queues, i);
+    if (mq->ring)
+      {
+	int rv;
+	rv = vnet_hw_interface_unassign_rx_thread (vnm, mif->hw_if_index, i);
+	if (rv)
+	  DBG ("Warning: unable to unassign interface %d, "
+	       "queue %d: rc=%d", mif->hw_if_index, i, rv);
+	mq->ring = 0;
+      }
+  }
+
+  /* free tx and rx queues */
+  vec_foreach (mq, mif->rx_queues) memif_queue_intfd_close (mq);
+  vec_free (mif->rx_queues);
+
+  vec_foreach (mq, mif->tx_queues) memif_queue_intfd_close (mq);
+  vec_free (mif->tx_queues);
+
+  /* free memory regions */
+  vec_foreach (mr, mif->regions)
+  {
+    int rv;
+    if ((rv = munmap (mr->shm, mr->region_size)))
+      clib_warning ("munmap failed, rv = %d", rv);
+    if (mr->fd > -1)
+      close (mr->fd);
+  }
+  vec_free (mif->regions);
+
+  mif->remote_pid = 0;
+  vec_free (mif->remote_name);
+  vec_free (mif->remote_if_name);
+  clib_fifo_free (mif->msg_queue);
+}
+
+static clib_error_t *
+memif_int_fd_read_ready (clib_file_t * uf)
+{
+  memif_main_t *mm = &memif_main;
+  vnet_main_t *vnm = vnet_get_main ();
+  u16 qid = uf->private_data & 0xFFFF;
+  memif_if_t *mif = vec_elt_at_index (mm->interfaces, uf->private_data >> 16);
+  memif_queue_t *mq = vec_elt_at_index (mif->rx_queues, qid);
+  u64 b;
+  ssize_t size;
+
+  size = read (uf->file_descriptor, &b, sizeof (b));
+  if (size < 0)
+    {
+      DBG_UNIX_LOG ("Failed to read from socket");
+      return 0;
+    }
+
+  vnet_device_input_set_interrupt_pending (vnm, mif->hw_if_index, qid);
+  mq->int_count++;
+
+  return 0;
+}
+
+
+clib_error_t *
+memif_connect (memif_if_t * mif)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  clib_file_t template = { 0 };
+  memif_region_t *mr;
+  int i;
+
+  DBG ("connect %u", mif->dev_instance);
+
+  vec_free (mif->local_disc_string);
+  vec_free (mif->remote_disc_string);
+
+  vec_foreach (mr, mif->regions)
+  {
+    if (mr->shm)
+      continue;
+
+    if (mr->fd < 0)
+      return clib_error_return (0, "no memory region fd");
+
+    if ((mr->shm = mmap (NULL, mr->region_size, PROT_READ | PROT_WRITE,
+			 MAP_SHARED, mr->fd, 0)) == MAP_FAILED)
+      return clib_error_return_unix (0, "mmap");
+  }
+
+  template.read_function = memif_int_fd_read_ready;
+
+  vec_foreach_index (i, mif->tx_queues)
+  {
+    memif_queue_t *mq = vec_elt_at_index (mif->tx_queues, i);
+
+    mq->ring = mif->regions[mq->region].shm + mq->offset;
+    if (mq->ring->cookie != MEMIF_COOKIE)
+      return clib_error_return (0, "wrong cookie on tx ring %u", i);
+  }
+
+  vec_foreach_index (i, mif->rx_queues)
+  {
+    memif_queue_t *mq = vec_elt_at_index (mif->rx_queues, i);
+    int rv;
+
+    mq->ring = mif->regions[mq->region].shm + mq->offset;
+    if (mq->ring->cookie != MEMIF_COOKIE)
+      return clib_error_return (0, "wrong cookie on rx ring %u", i);
+
+    if (mq->int_fd > -1)
+      {
+	template.file_descriptor = mq->int_fd;
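+	/* private_data packs the device instance into the upper 16 bits
+	 * and the queue id into the lower 16 bits;
+	 * memif_int_fd_read_ready () above decodes the same layout to
+	 * find the queue whose interrupt fd became readable */
+ 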
template.private_data = (mif->dev_instance << 16) | (i & 0xFFFF); + memif_file_add (&mq->int_clib_file_index, &template); + } + vnet_hw_interface_assign_rx_thread (vnm, mif->hw_if_index, i, ~0); + rv = vnet_hw_interface_set_rx_mode (vnm, mif->hw_if_index, i, + VNET_HW_INTERFACE_RX_MODE_DEFAULT); + if (rv) + clib_warning + ("Warning: unable to set rx mode for interface %d queue %d: " + "rc=%d", mif->hw_if_index, i, rv); + else + { + vnet_hw_interface_rx_mode rxmode; + vnet_hw_interface_get_rx_mode (vnm, mif->hw_if_index, i, &rxmode); + + if (rxmode == VNET_HW_INTERFACE_RX_MODE_POLLING) + mq->ring->flags |= MEMIF_RING_FLAG_MASK_INT; + } + } + + mif->flags &= ~MEMIF_IF_FLAG_CONNECTING; + mif->flags |= MEMIF_IF_FLAG_CONNECTED; + + vnet_hw_interface_set_flags (vnm, mif->hw_if_index, + VNET_HW_INTERFACE_FLAG_LINK_UP); + return 0; +} + +static_always_inline memif_ring_t * +memif_get_ring (memif_if_t * mif, memif_ring_type_t type, u16 ring_num) +{ + if (vec_len (mif->regions) == 0) + return NULL; + void *p = mif->regions[0].shm; + int ring_size = + sizeof (memif_ring_t) + + sizeof (memif_desc_t) * (1 << mif->run.log2_ring_size); + p += (ring_num + type * mif->run.num_s2m_rings) * ring_size; + + return (memif_ring_t *) p; +} + +clib_error_t * +memif_init_regions_and_queues (memif_if_t * mif) +{ + memif_ring_t *ring = NULL; + int i, j; + u64 buffer_offset; + memif_region_t *r; + clib_mem_vm_alloc_t alloc = { 0 }; + clib_error_t *err; + + vec_validate_aligned (mif->regions, 0, CLIB_CACHE_LINE_BYTES); + r = vec_elt_at_index (mif->regions, 0); + + buffer_offset = (mif->run.num_s2m_rings + mif->run.num_m2s_rings) * + (sizeof (memif_ring_t) + + sizeof (memif_desc_t) * (1 << mif->run.log2_ring_size)); + + r->region_size = buffer_offset + + mif->run.buffer_size * (1 << mif->run.log2_ring_size) * + (mif->run.num_s2m_rings + mif->run.num_m2s_rings); + + alloc.name = "memif region"; + alloc.size = r->region_size; + alloc.flags = CLIB_MEM_VM_F_SHARED; + + err = clib_mem_vm_ext_alloc (&alloc); + if (err) + return err; + + r->fd = alloc.fd; + r->shm = alloc.addr; + + for (i = 0; i < mif->run.num_s2m_rings; i++) + { + ring = memif_get_ring (mif, MEMIF_RING_S2M, i); + ring->head = ring->tail = 0; + ring->cookie = MEMIF_COOKIE; + for (j = 0; j < (1 << mif->run.log2_ring_size); j++) + { + u16 slot = i * (1 << mif->run.log2_ring_size) + j; + ring->desc[j].region = 0; + ring->desc[j].offset = + buffer_offset + (u32) (slot * mif->run.buffer_size); + ring->desc[j].buffer_length = mif->run.buffer_size; + } + } + for (i = 0; i < mif->run.num_m2s_rings; i++) + { + ring = memif_get_ring (mif, MEMIF_RING_M2S, i); + ring->head = ring->tail = 0; + ring->cookie = MEMIF_COOKIE; + for (j = 0; j < (1 << mif->run.log2_ring_size); j++) + { + u16 slot = + (i + mif->run.num_s2m_rings) * (1 << mif->run.log2_ring_size) + j; + ring->desc[j].region = 0; + ring->desc[j].offset = + buffer_offset + (u32) (slot * mif->run.buffer_size); + ring->desc[j].buffer_length = mif->run.buffer_size; + } + } + + ASSERT (mif->tx_queues == 0); + vec_validate_aligned (mif->tx_queues, mif->run.num_s2m_rings - 1, + CLIB_CACHE_LINE_BYTES); + vec_foreach_index (i, mif->tx_queues) + { + memif_queue_t *mq = vec_elt_at_index (mif->tx_queues, i); + if ((mq->int_fd = eventfd (0, EFD_NONBLOCK)) < 0) + return clib_error_return_unix (0, "eventfd[tx queue %u]", i); + mq->int_clib_file_index = ~0; + mq->ring = memif_get_ring (mif, MEMIF_RING_S2M, i); + mq->log2_ring_size = mif->cfg.log2_ring_size; + mq->region = 0; + mq->offset = (void *) mq->ring - (void *) 
mif->regions[mq->region].shm; + mq->last_head = 0; + } + + ASSERT (mif->rx_queues == 0); + vec_validate_aligned (mif->rx_queues, mif->run.num_m2s_rings - 1, + CLIB_CACHE_LINE_BYTES); + vec_foreach_index (i, mif->rx_queues) + { + memif_queue_t *mq = vec_elt_at_index (mif->rx_queues, i); + if ((mq->int_fd = eventfd (0, EFD_NONBLOCK)) < 0) + return clib_error_return_unix (0, "eventfd[rx queue %u]", i); + mq->int_clib_file_index = ~0; + mq->ring = memif_get_ring (mif, MEMIF_RING_M2S, i); + mq->log2_ring_size = mif->cfg.log2_ring_size; + mq->region = 0; + mq->offset = (void *) mq->ring - (void *) mif->regions[mq->region].shm; + mq->last_head = 0; + } + + return 0; +} + +static uword +memif_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) +{ + memif_main_t *mm = &memif_main; + memif_if_t *mif; + struct sockaddr_un sun; + int sockfd; + uword *event_data = 0, event_type; + u8 enabled = 0; + f64 start_time, last_run_duration = 0, now; + + sockfd = socket (AF_UNIX, SOCK_SEQPACKET, 0); + if (sockfd < 0) + { + DBG_UNIX_LOG ("socket AF_UNIX"); + return 0; + } + sun.sun_family = AF_UNIX; + + while (1) + { + if (enabled) + vlib_process_wait_for_event_or_clock (vm, (f64) 3 - + last_run_duration); + else + vlib_process_wait_for_event (vm); + + event_type = vlib_process_get_events (vm, &event_data); + vec_reset_length (event_data); + + switch (event_type) + { + case ~0: + break; + case MEMIF_PROCESS_EVENT_START: + enabled = 1; + break; + case MEMIF_PROCESS_EVENT_STOP: + enabled = 0; + continue; + default: + ASSERT (0); + } + + last_run_duration = start_time = vlib_time_now (vm); + /* *INDENT-OFF* */ + pool_foreach (mif, mm->interfaces, + ({ + memif_socket_file_t * msf = vec_elt_at_index (mm->socket_files, mif->socket_file_index); + /* Allow no more than 10us without a pause */ + now = vlib_time_now (vm); + if (now > start_time + 10e-6) + { + vlib_process_suspend (vm, 100e-6); /* suspend for 100 us */ + start_time = vlib_time_now (vm); + } + + if ((mif->flags & MEMIF_IF_FLAG_ADMIN_UP) == 0) + continue; + + if (mif->flags & MEMIF_IF_FLAG_CONNECTING) + continue; + + if (mif->flags & MEMIF_IF_FLAG_CONNECTED) + continue; + + if (mif->flags & MEMIF_IF_FLAG_IS_SLAVE) + { + strncpy (sun.sun_path, (char *) msf->filename, + sizeof (sun.sun_path) - 1); + + if (connect + (sockfd, (struct sockaddr *) &sun, + sizeof (struct sockaddr_un)) == 0) + { + clib_file_t t = { 0 }; + + mif->conn_fd = sockfd; + t.read_function = memif_slave_conn_fd_read_ready; + t.write_function = memif_slave_conn_fd_write_ready; + t.error_function = memif_slave_conn_fd_error; + t.file_descriptor = mif->conn_fd; + t.private_data = mif->dev_instance; + memif_file_add (&mif->conn_clib_file_index, &t); + hash_set (msf->dev_instance_by_fd, mif->conn_fd, mif->dev_instance); + + mif->flags |= MEMIF_IF_FLAG_CONNECTING; + + /* grab another fd */ + sockfd = socket (AF_UNIX, SOCK_SEQPACKET, 0); + if (sockfd < 0) + { + DBG_UNIX_LOG ("socket AF_UNIX"); + return 0; + } + } + } + })); + /* *INDENT-ON* */ + last_run_duration = vlib_time_now (vm) - last_run_duration; + } + return 0; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (memif_process_node,static) = { + .function = memif_process, + .type = VLIB_NODE_TYPE_PROCESS, + .name = "memif-process", +}; +/* *INDENT-ON* */ + +int +memif_delete_if (vlib_main_t * vm, memif_if_t * mif) +{ + vnet_main_t *vnm = vnet_get_main (); + memif_main_t *mm = &memif_main; + memif_socket_file_t *msf = + vec_elt_at_index (mm->socket_files, mif->socket_file_index); + clib_error_t *err; + + mif->flags |= 
MEMIF_IF_FLAG_DELETING;
+  vec_free (mif->local_disc_string);
+  vec_free (mif->remote_disc_string);
+
+  /* bring down the interface */
+  vnet_hw_interface_set_flags (vnm, mif->hw_if_index, 0);
+  vnet_sw_interface_set_flags (vnm, mif->sw_if_index, 0);
+
+  err = clib_error_return (0, "interface deleted");
+  memif_disconnect (mif, err);
+  clib_error_free (err);
+
+  /* remove the interface */
+  if (mif->mode == MEMIF_INTERFACE_MODE_IP)
+    vnet_delete_hw_interface (vnm, mif->hw_if_index);
+  else
+    ethernet_delete_interface (vnm, mif->hw_if_index);
+  mif->hw_if_index = ~0;
+
+  /* free interface data structures */
+  clib_spinlock_free (&mif->lockp);
+  mhash_unset (&msf->dev_instance_by_id, &mif->id, 0);
+
+  /* remove socket file */
+  if (--(msf->ref_cnt) == 0)
+    {
+      if (msf->is_listener)
+	{
+	  uword *x;
+	  memif_file_del_by_index (msf->clib_file_index);
+	  vec_foreach (x, msf->pending_file_indices)
+	  {
+	    memif_file_del_by_index (*x);
+	  }
+	  vec_free (msf->pending_file_indices);
+	}
+      mhash_free (&msf->dev_instance_by_id);
+      hash_free (msf->dev_instance_by_fd);
+      mhash_unset (&mm->socket_file_index_by_filename, msf->filename, 0);
+      vec_free (msf->filename);
+      pool_put (mm->socket_files, msf);
+    }
+
+  memset (mif, 0, sizeof (*mif));
+  pool_put (mm->interfaces, mif);
+
+  if (pool_elts (mm->interfaces) == 0)
+    vlib_process_signal_event (vm, memif_process_node.index,
+			       MEMIF_PROCESS_EVENT_STOP, 0);
+
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VNET_HW_INTERFACE_CLASS (memif_ip_hw_if_class, static) =
+{
+  .name = "memif-ip",
+  .flags = VNET_HW_INTERFACE_CLASS_FLAG_P2P,
+};
+/* *INDENT-ON* */
+
+int
+memif_create_if (vlib_main_t * vm, memif_create_if_args_t * args)
+{
+  memif_main_t *mm = &memif_main;
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  vnet_main_t *vnm = vnet_get_main ();
+  memif_if_t *mif = 0;
+  vnet_sw_interface_t *sw;
+  clib_error_t *error = 0;
+  int ret = 0;
+  uword *p;
+  vnet_hw_interface_t *hw;
+  memif_socket_file_t *msf = 0;
+  u8 *socket_filename;
+  int rv = 0;
+
+  if (args->socket_filename == 0 || args->socket_filename[0] != '/')
+    {
+      clib_error_t *error;
+      error = vlib_unix_recursive_mkdir (vlib_unix_get_runtime_dir ());
+      if (error)
+	{
+	  clib_error_free (error);
+	  return VNET_API_ERROR_SYSCALL_ERROR_1;
+	}
+
+      if (args->socket_filename == 0)
+	socket_filename = format (0, "%s/%s%c", vlib_unix_get_runtime_dir (),
+				  MEMIF_DEFAULT_SOCKET_FILENAME, 0);
+      else
+	socket_filename = format (0, "%s/%s%c", vlib_unix_get_runtime_dir (),
+				  args->socket_filename, 0);
+
+    }
+  else
+    socket_filename = vec_dup (args->socket_filename);
+
+  p = mhash_get (&mm->socket_file_index_by_filename, socket_filename);
+
+  if (p)
+    {
+      msf = vec_elt_at_index (mm->socket_files, p[0]);
+
+      /* existing socket file can be either master or slave but cannot be both */
+      if (!msf->is_listener != !args->is_master)
+	{
+	  rv = VNET_API_ERROR_SUBIF_ALREADY_EXISTS;
+	  goto done;
+	}
+
+      p = mhash_get (&msf->dev_instance_by_id, &args->id);
+      if (p)
+	{
+	  rv = VNET_API_ERROR_SUBIF_ALREADY_EXISTS;
+	  goto done;
+	}
+    }
+
+  /* Create new socket file */
+  if (msf == 0)
+    {
+      struct stat file_stat;
+      /* If we are creating a listener, make sure the file doesn't exist,
+       * or delete it first if it is a stale socket file */
+      if (args->is_master &&
+	  (stat ((char *) socket_filename, &file_stat) == 0))
+	{
+	  if (S_ISSOCK (file_stat.st_mode))
+	    {
+	      unlink ((char *) socket_filename);
+	    }
+	  else
+	    {
+	      error = clib_error_return (0, "File exists for %s",
+					 socket_filename);
+	      clib_error_report (error);
+	      rv = VNET_API_ERROR_VALUE_EXIST;
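+	      /* the path is occupied by a regular (non-socket) file;
+	       * refuse to touch it and fail the create request */
+ 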
goto done; + } + } + pool_get (mm->socket_files, msf); + memset (msf, 0, sizeof (memif_socket_file_t)); + mhash_init (&msf->dev_instance_by_id, sizeof (uword), + sizeof (memif_interface_id_t)); + msf->dev_instance_by_fd = hash_create (0, sizeof (uword)); + msf->filename = socket_filename; + msf->fd = -1; + msf->is_listener = (args->is_master != 0); + socket_filename = 0; + mhash_set (&mm->socket_file_index_by_filename, msf->filename, + msf - mm->socket_files, 0); + DBG ("creating socket file %s", msf->filename); + } + + pool_get (mm->interfaces, mif); + memset (mif, 0, sizeof (*mif)); + mif->dev_instance = mif - mm->interfaces; + mif->socket_file_index = msf - mm->socket_files; + mif->id = args->id; + mif->sw_if_index = mif->hw_if_index = mif->per_interface_next_index = ~0; + mif->conn_clib_file_index = ~0; + mif->conn_fd = -1; + mif->mode = args->mode; + if (args->secret) + mif->secret = vec_dup (args->secret); + + if (tm->n_vlib_mains > 1) + clib_spinlock_init (&mif->lockp); + + + if (mif->mode == MEMIF_INTERFACE_MODE_ETHERNET) + { + + if (!args->hw_addr_set) + { + f64 now = vlib_time_now (vm); + u32 rnd; + rnd = (u32) (now * 1e6); + rnd = random_u32 (&rnd); + + memcpy (args->hw_addr + 2, &rnd, sizeof (rnd)); + args->hw_addr[0] = 2; + args->hw_addr[1] = 0xfe; + } + error = ethernet_register_interface (vnm, memif_device_class.index, + mif->dev_instance, args->hw_addr, + &mif->hw_if_index, + memif_eth_flag_change); + } + else if (mif->mode == MEMIF_INTERFACE_MODE_IP) + { + mif->hw_if_index = + vnet_register_interface (vnm, memif_device_class.index, + mif->dev_instance, + memif_ip_hw_if_class.index, + mif->dev_instance); + } + else + error = clib_error_return (0, "unsupported interface mode"); + + if (error) + { + clib_error_report (error); + ret = VNET_API_ERROR_SYSCALL_ERROR_2; + goto error; + } + + sw = vnet_get_hw_sw_interface (vnm, mif->hw_if_index); + mif->sw_if_index = sw->sw_if_index; + + mif->cfg.log2_ring_size = args->log2_ring_size; + mif->cfg.buffer_size = args->buffer_size; + mif->cfg.num_s2m_rings = + args->is_master ? args->rx_queues : args->tx_queues; + mif->cfg.num_m2s_rings = + args->is_master ? 
args->tx_queues : args->rx_queues;
+
+  args->sw_if_index = mif->sw_if_index;
+
+  /* If this is new one, start listening */
+  if (msf->is_listener && msf->ref_cnt == 0)
+    {
+      struct sockaddr_un un = { 0 };
+      struct stat file_stat;
+      int on = 1;
+
+      if ((msf->fd = socket (AF_UNIX, SOCK_SEQPACKET, 0)) < 0)
+	{
+	  ret = VNET_API_ERROR_SYSCALL_ERROR_4;
+	  goto error;
+	}
+
+      un.sun_family = AF_UNIX;
+      strncpy ((char *) un.sun_path, (char *) msf->filename,
+	       sizeof (un.sun_path) - 1);
+
+      if (setsockopt (msf->fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof (on)) < 0)
+	{
+	  ret = VNET_API_ERROR_SYSCALL_ERROR_5;
+	  goto error;
+	}
+      if (bind (msf->fd, (struct sockaddr *) &un, sizeof (un)) == -1)
+	{
+	  ret = VNET_API_ERROR_SYSCALL_ERROR_6;
+	  goto error;
+	}
+      if (listen (msf->fd, 1) == -1)
+	{
+	  ret = VNET_API_ERROR_SYSCALL_ERROR_7;
+	  goto error;
+	}
+
+      if (stat ((char *) msf->filename, &file_stat) == -1)
+	{
+	  ret = VNET_API_ERROR_SYSCALL_ERROR_8;
+	  goto error;
+	}
+
+      msf->clib_file_index = ~0;
+      clib_file_t template = { 0 };
+      template.read_function = memif_conn_fd_accept_ready;
+      template.file_descriptor = msf->fd;
+      template.private_data = mif->socket_file_index;
+      memif_file_add (&msf->clib_file_index, &template);
+    }
+
+  msf->ref_cnt++;
+
+  if (args->is_master == 0)
+    mif->flags |= MEMIF_IF_FLAG_IS_SLAVE;
+
+  hw = vnet_get_hw_interface (vnm, mif->hw_if_index);
+  hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_INT_MODE;
+  vnet_hw_interface_set_input_node (vnm, mif->hw_if_index,
+				    memif_input_node.index);
+
+  mhash_set (&msf->dev_instance_by_id, &mif->id, mif->dev_instance, 0);
+
+  if (pool_elts (mm->interfaces) == 1)
+    {
+      vlib_process_signal_event (vm, memif_process_node.index,
+				 MEMIF_PROCESS_EVENT_START, 0);
+    }
+  goto done;
+
+error:
+  if (mif->hw_if_index != ~0)
+    {
+      if (mif->mode == MEMIF_INTERFACE_MODE_IP)
+	vnet_delete_hw_interface (vnm, mif->hw_if_index);
+      else
+	ethernet_delete_interface (vnm, mif->hw_if_index);
+      mif->hw_if_index = ~0;
+    }
+  memif_delete_if (vm, mif);
+  return ret;
+
+done:
+  vec_free (socket_filename);
+  return rv;
+}
+
+
+static clib_error_t *
+memif_init (vlib_main_t * vm)
+{
+  memif_main_t *mm = &memif_main;
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+
+  memset (mm, 0, sizeof (memif_main_t));
+
+  /* initialize binary API */
+  memif_plugin_api_hookup (vm);
+
+  mhash_init_c_string (&mm->socket_file_index_by_filename, sizeof (uword));
+
+  vec_validate_aligned (mm->rx_buffers, tm->n_vlib_mains - 1,
+			CLIB_CACHE_LINE_BYTES);
+
+  return 0;
+}
+
+VLIB_INIT_FUNCTION (memif_init);
+
+/* *INDENT-OFF* */
+VLIB_PLUGIN_REGISTER () = {
+    .version = VPP_BUILD_VER,
+    .description = "Packet Memory Interface (experimental)",
+};
+/* *INDENT-ON* */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/memif/memif.h b/src/plugins/memif/memif.h
new file mode 100644
index 00000000..11918eab
--- /dev/null
+++ b/src/plugins/memif/memif.h
@@ -0,0 +1,185 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#ifndef _MEMIF_H_ +#define _MEMIF_H_ + +#ifndef MEMIF_CACHELINE_SIZE +#define MEMIF_CACHELINE_SIZE 64 +#endif + +#define MEMIF_COOKIE 0x3E31F10 +#define MEMIF_VERSION_MAJOR 1 +#define MEMIF_VERSION_MINOR 0 +#define MEMIF_VERSION ((MEMIF_VERSION_MAJOR << 8) | MEMIF_VERSION_MINOR) + +/* + * Type definitions + */ + +typedef enum memif_msg_type +{ + MEMIF_MSG_TYPE_NONE = 0, + MEMIF_MSG_TYPE_ACK = 1, + MEMIF_MSG_TYPE_HELLO = 2, + MEMIF_MSG_TYPE_INIT = 3, + MEMIF_MSG_TYPE_ADD_REGION = 4, + MEMIF_MSG_TYPE_ADD_RING = 5, + MEMIF_MSG_TYPE_CONNECT = 6, + MEMIF_MSG_TYPE_CONNECTED = 7, + MEMIF_MSG_TYPE_DISCONNECT = 8, +} memif_msg_type_t; + +typedef enum +{ + MEMIF_RING_S2M = 0, + MEMIF_RING_M2S = 1 +} memif_ring_type_t; + +typedef enum +{ + MEMIF_INTERFACE_MODE_ETHERNET = 0, + MEMIF_INTERFACE_MODE_IP = 1, + MEMIF_INTERFACE_MODE_PUNT_INJECT = 2, +} memif_interface_mode_t; + +typedef uint16_t memif_region_index_t; +typedef uint64_t memif_region_offset_t; +typedef uint64_t memif_region_size_t; +typedef uint16_t memif_ring_index_t; +typedef uint32_t memif_interface_id_t; +typedef uint16_t memif_version_t; +typedef uint8_t memif_log2_ring_size_t; + +/* + * Socket messages + */ + +typedef struct __attribute__ ((packed)) +{ + uint8_t name[32]; + memif_version_t min_version; + memif_version_t max_version; + memif_region_index_t max_region; + memif_ring_index_t max_m2s_ring; + memif_ring_index_t max_s2m_ring; + memif_log2_ring_size_t max_log2_ring_size; +} memif_msg_hello_t; + +typedef struct __attribute__ ((packed)) +{ + memif_version_t version; + memif_interface_id_t id; + memif_interface_mode_t mode:8; + uint8_t secret[24]; + uint8_t name[32]; +} memif_msg_init_t; + +typedef struct __attribute__ ((packed)) +{ + memif_region_index_t index; + memif_region_size_t size; +} memif_msg_add_region_t; + +typedef struct __attribute__ ((packed)) +{ + uint16_t flags; +#define MEMIF_MSG_ADD_RING_FLAG_S2M (1 << 0) + memif_ring_index_t index; + memif_region_index_t region; + memif_region_offset_t offset; + memif_log2_ring_size_t log2_ring_size; +} memif_msg_add_ring_t; + +typedef struct __attribute__ ((packed)) +{ + uint8_t if_name[32]; +} memif_msg_connect_t; + +typedef struct __attribute__ ((packed)) +{ + uint8_t if_name[32]; +} memif_msg_connected_t; + +typedef struct __attribute__ ((packed)) +{ + uint32_t code; + uint8_t string[96]; +} memif_msg_disconnect_t; + +typedef struct __attribute__ ((packed, aligned (128))) +{ + memif_msg_type_t type:16; + union + { + memif_msg_hello_t hello; + memif_msg_init_t init; + memif_msg_add_region_t add_region; + memif_msg_add_ring_t add_ring; + memif_msg_connect_t connect; + memif_msg_connected_t connected; + memif_msg_disconnect_t disconnect; + }; +} memif_msg_t; + +_Static_assert (sizeof (memif_msg_t) == 128, + "Size of memif_msg_t must be 128"); + +/* + * Ring and Descriptor Layout + */ + +typedef struct __attribute__ ((packed)) +{ + uint16_t flags; +#define MEMIF_DESC_FLAG_NEXT (1 << 0) + memif_region_index_t region; + uint32_t buffer_length; + uint32_t length; + uint8_t reserved[4]; 
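+  /* offset of this descriptor's buffer within its region; region 0
+   * lays out all rings first, followed by the per-slot buffers
+   * (see memif_init_regions_and_queues () in memif.c) */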
+  memif_region_offset_t offset;
+  uint64_t metadata;
+} memif_desc_t;
+
+_Static_assert (sizeof (memif_desc_t) == 32,
+		"Size of memif_desc_t must be 32");
+
+#define MEMIF_CACHELINE_ALIGN_MARK(mark) \
+  uint8_t mark[0] __attribute__((aligned(MEMIF_CACHELINE_SIZE)))
+
+typedef struct
+{
+  MEMIF_CACHELINE_ALIGN_MARK (cacheline0);
+  uint32_t cookie;
+  uint16_t flags;
+#define MEMIF_RING_FLAG_MASK_INT 1
+  volatile uint16_t head;
+  MEMIF_CACHELINE_ALIGN_MARK (cacheline1);
+  volatile uint16_t tail;
+  MEMIF_CACHELINE_ALIGN_MARK (cacheline2);
+  memif_desc_t desc[0];
+} memif_ring_t;
+
+#endif /* _MEMIF_H_ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/memif/memif_all_api_h.h b/src/plugins/memif/memif_all_api_h.h
new file mode 100644
index 00000000..9729ec16
--- /dev/null
+++ b/src/plugins/memif/memif_all_api_h.h
@@ -0,0 +1,18 @@
+/*
+ * memif_all_api_h.h - plug-in api #include file
+ *
+ * Copyright (c) <current-year> <your-organization>
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/* Include the generated file, see BUILT_SOURCES in Makefile.am */
+#include <memif/memif.api.h>
diff --git a/src/plugins/memif/memif_api.c b/src/plugins/memif/memif_api.c
new file mode 100644
index 00000000..07347bc0
--- /dev/null
+++ b/src/plugins/memif/memif_api.c
@@ -0,0 +1,350 @@
+/*
+ *------------------------------------------------------------------
+ * memif_api.c - memif api
+ *
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vlib/unix/unix.h>
+#include <memif/memif.h>
+#include <memif/private.h>
+
+#include <vlibapi/api.h>
+#include <vlibmemory/api.h>
+#include <vlibsocket/api.h>
+
+/* define message IDs */
+#include <memif/memif_msg_enum.h>
+
+/* define message structures */
+#define vl_typedefs
+#include <memif/memif_all_api_h.h>
+#undef vl_typedefs
+
+/* define generated endian-swappers */
+#define vl_endianfun
+#include <memif/memif_all_api_h.h>
+#undef vl_endianfun
+
+/* instantiate all the print functions we know about */
+#define vl_print(handle, ...) 
vlib_cli_output (handle, __VA_ARGS__) +#define vl_printfun +#include <memif/memif_all_api_h.h> +#undef vl_printfun + +/* Get the API version number */ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <memif/memif_all_api_h.h> +#undef vl_api_version + +/* + * A handy macro to set up a message reply. + * Assumes that the following variables are available: + * mp - pointer to request message + * rmp - pointer to reply message type + * rv - return value + */ +#define REPLY_MACRO(t) \ +do { \ + unix_shared_memory_queue_t * q = \ + vl_api_client_index_to_input_queue (mp->client_index); \ + if (!q) \ + return; \ + \ + rmp = vl_msg_api_alloc (sizeof (*rmp)); \ + rmp->_vl_msg_id = htons ((t)+mm->msg_id_base); \ + rmp->context = mp->context; \ + rmp->retval = htonl (rv); \ + \ + vl_msg_api_send_shmem (q, (u8 *)&rmp); \ +} while(0); + +#define REPLY_MACRO2(t, body) \ +do { \ + unix_shared_memory_queue_t * q = \ + vl_api_client_index_to_input_queue (mp->client_index); \ + if (!q) \ + return; \ + \ + rmp = vl_msg_api_alloc (sizeof (*rmp)); \ + rmp->_vl_msg_id = htons ((t)+mm->msg_id_base); \ + rmp->context = mp->context; \ + rmp->retval = htonl (rv); \ + do {body;} while (0); \ + vl_msg_api_send_shmem (q, (u8 *)&rmp); \ +} while(0); + +#define foreach_memif_plugin_api_msg \ +_(MEMIF_CREATE, memif_create) \ +_(MEMIF_DELETE, memif_delete) \ +_(MEMIF_DUMP, memif_dump) \ + +/** + * @brief Message handler for memif_create API. + * @param mp vl_api_memif_create_t * mp the api message + */ +void +vl_api_memif_create_t_handler (vl_api_memif_create_t * mp) +{ + memif_main_t *mm = &memif_main; + vlib_main_t *vm = vlib_get_main (); + vl_api_memif_create_reply_t *rmp; + memif_create_if_args_t args = { 0 }; + u32 ring_size = MEMIF_DEFAULT_RING_SIZE; + static const u8 empty_hw_addr[6]; + int rv = 0; + + /* id */ + args.id = clib_net_to_host_u32 (mp->id); + + /* socket filename */ + mp->socket_filename[ARRAY_LEN (mp->socket_filename) - 1] = 0; + if (strlen ((char *) mp->socket_filename) > 0) + { + vec_validate (args.socket_filename, + strlen ((char *) mp->socket_filename)); + strncpy ((char *) args.socket_filename, (char *) mp->socket_filename, + vec_len (args.socket_filename)); + } + + /* secret */ + mp->secret[ARRAY_LEN (mp->secret) - 1] = 0; + if (strlen ((char *) mp->secret) > 0) + { + vec_validate (args.secret, strlen ((char *) mp->secret)); + strncpy ((char *) args.secret, (char *) mp->secret, + vec_len (args.secret)); + } + + /* role */ + args.is_master = (mp->role == 0); + + /* mode */ + args.mode = mp->mode; + + /* rx/tx queues */ + if (args.is_master == 0) + { + args.rx_queues = MEMIF_DEFAULT_RX_QUEUES; + args.tx_queues = MEMIF_DEFAULT_TX_QUEUES; + if (mp->rx_queues) + { + args.rx_queues = mp->rx_queues; + } + if (mp->tx_queues) + { + args.tx_queues = mp->tx_queues; + } + } + + /* ring size */ + if (mp->ring_size) + { + ring_size = ntohl (mp->ring_size); + } + if (!is_pow2 (ring_size)) + { + rv = VNET_API_ERROR_INVALID_ARGUMENT; + goto reply; + } + args.log2_ring_size = min_log2 (ring_size); + + /* buffer size */ + args.buffer_size = MEMIF_DEFAULT_BUFFER_SIZE; + if (mp->buffer_size) + { + args.buffer_size = ntohs (mp->buffer_size); + } + + /* MAC address */ + if (memcmp (mp->hw_addr, empty_hw_addr, 6) != 0) + { + memcpy (args.hw_addr, mp->hw_addr, 6); + args.hw_addr_set = 1; + } + + rv = memif_create_if (vm, &args); + + vec_free (args.socket_filename); + vec_free (args.secret); + +reply: + /* *INDENT-OFF* */ + REPLY_MACRO2 (VL_API_MEMIF_CREATE_REPLY, + ({ + rmp->sw_if_index = htonl 
(args.sw_if_index); + })); + /* *INDENT-ON* */ +} + +/** + * @brief Message handler for memif_delete API. + * @param mp vl_api_memif_delete_t * mp the api message + */ +void +vl_api_memif_delete_t_handler (vl_api_memif_delete_t * mp) +{ + memif_main_t *mm = &memif_main; + vlib_main_t *vm = vlib_get_main (); + vnet_main_t *vnm = vnet_get_main (); + vl_api_memif_delete_reply_t *rmp; + vnet_hw_interface_t *hi = + vnet_get_sup_hw_interface (vnm, ntohl (mp->sw_if_index)); + memif_if_t *mif; + int rv = 0; + + if (hi == NULL || memif_device_class.index != hi->dev_class_index) + rv = VNET_API_ERROR_INVALID_SW_IF_INDEX; + else + { + mif = pool_elt_at_index (mm->interfaces, hi->dev_instance); + rv = memif_delete_if (vm, mif); + } + + REPLY_MACRO (VL_API_MEMIF_DELETE_REPLY); +} + +static void +send_memif_details (unix_shared_memory_queue_t * q, + memif_if_t * mif, + vnet_sw_interface_t * swif, + u8 * interface_name, u32 context) +{ + vl_api_memif_details_t *mp; + vnet_main_t *vnm = vnet_get_main (); + memif_main_t *mm = &memif_main; + memif_socket_file_t *msf = vec_elt_at_index (mm->socket_files, + mif->socket_file_index); + vnet_hw_interface_t *hwif; + + hwif = vnet_get_sup_hw_interface (vnm, swif->sw_if_index); + + mp = vl_msg_api_alloc (sizeof (*mp)); + memset (mp, 0, sizeof (*mp)); + + mp->_vl_msg_id = htons (VL_API_MEMIF_DETAILS + mm->msg_id_base); + mp->context = context; + + mp->sw_if_index = htonl (swif->sw_if_index); + strncpy ((char *) mp->if_name, + (char *) interface_name, ARRAY_LEN (mp->if_name) - 1); + memcpy (mp->hw_addr, hwif->hw_address, ARRAY_LEN (mp->hw_addr)); + + mp->id = clib_host_to_net_u32 (mif->id); + mp->role = (mif->flags & MEMIF_IF_FLAG_IS_SLAVE) ? 1 : 0; + strncpy ((char *) mp->socket_filename, + (char *) msf->filename, ARRAY_LEN (mp->socket_filename) - 1); + + mp->ring_size = htonl (1 << mif->run.log2_ring_size); + mp->buffer_size = htons (mif->run.buffer_size); + + mp->admin_up_down = (swif->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) ? 1 : 0; + mp->link_up_down = (hwif->flags & VNET_HW_INTERFACE_FLAG_LINK_UP) ? 1 : 0; + + vl_msg_api_send_shmem (q, (u8 *) & mp); +} + +/** + * @brief Message handler for memif_dump API. 
+ * @param mp vl_api_memif_dump_t * mp the api message + */ +void +vl_api_memif_dump_t_handler (vl_api_memif_dump_t * mp) +{ + memif_main_t *mm = &memif_main; + vnet_main_t *vnm = vnet_get_main (); + vnet_sw_interface_t *swif; + memif_if_t *mif; + u8 *if_name = 0; + unix_shared_memory_queue_t *q; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + return; + + /* *INDENT-OFF* */ + pool_foreach (mif, mm->interfaces, + ({ + swif = vnet_get_sw_interface (vnm, mif->sw_if_index); + + if_name = format (if_name, "%U%c", + format_vnet_sw_interface_name, + vnm, swif, 0); + + send_memif_details (q, mif, swif, if_name, mp->context); + _vec_len (if_name) = 0; + })); + /* *INDENT-ON* */ + + vec_free (if_name); +} + +#define vl_msg_name_crc_list +#include <memif/memif_all_api_h.h> +#undef vl_msg_name_crc_list + +static void +setup_message_id_table (memif_main_t * mm, api_main_t * am) +{ +#define _(id,n,crc) \ + vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id + mm->msg_id_base); + foreach_vl_msg_name_crc_memif; +#undef _ +} + +/* Set up the API message handling tables */ +clib_error_t * +memif_plugin_api_hookup (vlib_main_t * vm) +{ + memif_main_t *mm = &memif_main; + api_main_t *am = &api_main; + u8 *name; + + /* Construct the API name */ + name = format (0, "memif_%08x%c", api_version, 0); + + /* Ask for a correctly-sized block of API message decode slots */ + mm->msg_id_base = vl_msg_api_get_msg_ids + ((char *) name, VL_MSG_FIRST_AVAILABLE); + +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + mm->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_memif_plugin_api_msg; +#undef _ + + /* + * Set up the (msg_name, crc, message-id) table + */ + setup_message_id_table (mm, am); + + vec_free (name); + return 0; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/memif/memif_msg_enum.h b/src/plugins/memif/memif_msg_enum.h new file mode 100644 index 00000000..74efee00 --- /dev/null +++ b/src/plugins/memif/memif_msg_enum.h @@ -0,0 +1,31 @@ +/* + * memif_msg_enum.h - vpp engine plug-in message enumeration + * + * Copyright (c) <current-year> <your-organization> + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef included_memif_msg_enum_h +#define included_memif_msg_enum_h + +#include <vppinfra/byte_order.h> + +#define vl_msg_id(n,h) n, +typedef enum +{ +#include <memif/memif_all_api_h.h> + /* We'll want to know how many messages IDs we need... */ + VL_MSG_FIRST_AVAILABLE, +} vl_msg_id_t; +#undef vl_msg_id + +#endif /* included_memif_msg_enum_h */ diff --git a/src/plugins/memif/memif_test.c b/src/plugins/memif/memif_test.c new file mode 100644 index 00000000..4ca7526d --- /dev/null +++ b/src/plugins/memif/memif_test.c @@ -0,0 +1,372 @@ +/* + * memif VAT support + * + * Copyright (c) 2017 Cisco and/or its affiliates. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <inttypes.h> + +#include <vat/vat.h> +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vlibsocket/api.h> +#include <vppinfra/error.h> +#include <vnet/ip/ip.h> +#include <memif/memif.h> +#include <memif/private.h> + +#define __plugin_msg_base memif_test_main.msg_id_base +#include <vlibapi/vat_helper_macros.h> + +/* declare message IDs */ +#include <memif/memif_msg_enum.h> + +/* Get CRC codes of the messages defined outside of this plugin */ +#define vl_msg_name_crc_list +#include <vpp/api/vpe_all_api_h.h> +#undef vl_msg_name_crc_list + +/* define message structures */ +#define vl_typedefs +#include <vpp/api/vpe_all_api_h.h> +#include <memif/memif_all_api_h.h> +#undef vl_typedefs + +/* declare message handlers for each api */ + +#define vl_endianfun /* define message structures */ +#include <memif/memif_all_api_h.h> +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) +#define vl_printfun +#include <memif/memif_all_api_h.h> +#undef vl_printfun + +/* Get the API version number. */ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <memif/memif_all_api_h.h> +#undef vl_api_version + +typedef struct +{ + /* API message ID base */ + u16 msg_id_base; + u32 ping_id; + vat_main_t *vat_main; +} memif_test_main_t; + +memif_test_main_t memif_test_main; + +/* standard reply handlers */ +#define foreach_standard_reply_retval_handler \ +_(memif_delete_reply) + +#define _(n) \ + static void vl_api_##n##_t_handler \ + (vl_api_##n##_t * mp) \ + { \ + vat_main_t * vam = memif_test_main.vat_main; \ + i32 retval = ntohl(mp->retval); \ + if (vam->async_mode) { \ + vam->async_errors += (retval < 0); \ + } else { \ + vam->retval = retval; \ + vam->result_ready = 1; \ + } \ + } +foreach_standard_reply_retval_handler; +#undef _ + +/* + * Table of message reply handlers, must include boilerplate handlers + * we just generated + */ +#define foreach_vpe_api_reply_msg \ +_(MEMIF_CREATE_REPLY, memif_create_reply) \ +_(MEMIF_DELETE_REPLY, memif_delete_reply) \ +_(MEMIF_DETAILS, memif_details) + +static uword +unformat_memif_queues (unformat_input_t * input, va_list * args) +{ + u32 *rx_queues = va_arg (*args, u32 *); + u32 *tx_queues = va_arg (*args, u32 *); + + if (unformat (input, "rx-queues %u", rx_queues)) + ; + if (unformat (input, "tx-queues %u", tx_queues)) + ; + + return 1; +} + +/* memif-create API */ +static int +api_memif_create (vat_main_t * vam) +{ + unformat_input_t *i = vam->input; + vl_api_memif_create_t *mp; + u32 id = 0; + u8 *socket_filename = 0; + u8 *secret = 0; + u8 role = 1; + u32 ring_size = 0; + u32 buffer_size = 0; + u8 hw_addr[6] = { 0 }; + u32 rx_queues = MEMIF_DEFAULT_RX_QUEUES; + u32 tx_queues = MEMIF_DEFAULT_TX_QUEUES; + int ret; + u8 mode = MEMIF_INTERFACE_MODE_ETHERNET; + + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) + { + if (unformat (i, "id %u", &id)) + ; + else if (unformat (i, "socket %s", &socket_filename)) + ; + 
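The helper above illustrates unformat's %U convention, which the "slave %U" clause just below uses to hand &rx_queues and &tx_queues to unformat_memif_queues: a custom unformat function pulls its output pointers from va_args, so one token can fill several fields. A minimal sketch under the same convention; unformat_two_u32 is a hypothetical helper, not part of this patch:

static uword
unformat_two_u32 (unformat_input_t * input, va_list * args)
{
  u32 *a = va_arg (*args, u32 *);
  u32 *b = va_arg (*args, u32 *);

  /* consume "<a> <b>", filling both outputs, or match nothing */
  return unformat (input, "%u %u", a, b);
}

A caller would invoke it as unformat (i, "pair %U", unformat_two_u32, &x, &y), mirroring the "slave %U" clause in the parse loop here.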
else if (unformat (i, "secret %s", &secret)) + ; + else if (unformat (i, "ring_size %u", &ring_size)) + ; + else if (unformat (i, "buffer_size %u", &buffer_size)) + ; + else if (unformat (i, "master")) + role = 0; + else if (unformat (i, "slave %U", + unformat_memif_queues, &rx_queues, &tx_queues)) + role = 1; + else if (unformat (i, "mode ip")) + mode = MEMIF_INTERFACE_MODE_IP; + else if (unformat (i, "hw_addr %U", unformat_ethernet_address, hw_addr)) + ; + else + { + clib_warning ("unknown input '%U'", format_unformat_error, i); + return -99; + } + } + + if (!is_pow2 (ring_size)) + { + errmsg ("ring size must be power of 2\n"); + return -99; + } + + if (rx_queues > 255 || rx_queues < 1) + { + errmsg ("rx queue must be between 1 - 255\n"); + return -99; + } + + if (tx_queues > 255 || tx_queues < 1) + { + errmsg ("tx queue must be between 1 - 255\n"); + return -99; + } + + M (MEMIF_CREATE, mp); + + mp->mode = mode; + mp->id = clib_host_to_net_u32 (id); + mp->role = role; + mp->ring_size = clib_host_to_net_u32 (ring_size); + mp->buffer_size = clib_host_to_net_u16 (buffer_size & 0xffff); + if (socket_filename != 0) + { + strncpy ((char *) mp->socket_filename, (char *) socket_filename, 127); + vec_free (socket_filename); + } + if (secret != 0) + { + strncpy ((char *) mp->secret, (char *) secret, 16); + vec_free (secret); + } + memcpy (mp->hw_addr, hw_addr, 6); + mp->rx_queues = rx_queues; + mp->tx_queues = tx_queues; + + S (mp); + W (ret); + return ret; +} + +/* memif-create reply handler */ +static void vl_api_memif_create_reply_t_handler + (vl_api_memif_create_reply_t * mp) +{ + vat_main_t *vam = memif_test_main.vat_main; + i32 retval = ntohl (mp->retval); + + if (retval == 0) + { + fformat (vam->ofp, "created memif with sw_if_index %d\n", + ntohl (mp->sw_if_index)); + } + + vam->retval = retval; + vam->result_ready = 1; +} + +/* memif-delete API */ +static int +api_memif_delete (vat_main_t * vam) +{ + unformat_input_t *i = vam->input; + vl_api_memif_delete_t *mp; + u32 sw_if_index = 0; + u8 index_defined = 0; + int ret; + + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) + { + if (unformat (i, "sw_if_index %u", &sw_if_index)) + index_defined = 1; + else + { + clib_warning ("unknown input '%U'", format_unformat_error, i); + return -99; + } + } + + if (!index_defined) + { + errmsg ("missing sw_if_index\n"); + return -99; + } + + M (MEMIF_DELETE, mp); + + mp->sw_if_index = clib_host_to_net_u32 (sw_if_index); + + S (mp); + W (ret); + return ret; +} + +/* memif-dump API */ +static int +api_memif_dump (vat_main_t * vam) +{ + memif_test_main_t *mm = &memif_test_main; + vl_api_memif_dump_t *mp; + vl_api_control_ping_t *mp_ping; + int ret; + + if (vam->json_output) + { + clib_warning ("JSON output not supported for memif_dump"); + return -99; + } + + M (MEMIF_DUMP, mp); + S (mp); + + /* Use a control ping for synchronization */ + mp_ping = vl_msg_api_alloc_as_if_client (sizeof (*mp_ping)); + mp_ping->_vl_msg_id = htons (mm->ping_id); + mp_ping->client_index = vam->my_client_index; + + fformat (vam->ofp, "Sending ping id=%d\n", mm->ping_id); + + vam->result_ready = 0; + S (mp_ping); + + W (ret); + return ret; +} + +/* memif-details message handler */ +static void vl_api_memif_details_t_handler (vl_api_memif_details_t * mp) +{ + vat_main_t *vam = memif_test_main.vat_main; + + fformat (vam->ofp, "%s: sw_if_index %u mac %U\n" + " id %u socket %s role %s\n" + " ring_size %u buffer_size %u\n" + " state %s link %s\n", + mp->if_name, ntohl (mp->sw_if_index), format_ethernet_address, + 
mp->hw_addr, clib_net_to_host_u32 (mp->id), mp->socket_filename, + mp->role ? "slave" : "master", + ntohl (mp->ring_size), ntohs (mp->buffer_size), + mp->admin_up_down ? "up" : "down", + mp->link_up_down ? "up" : "down"); +} + +/* + * List of messages that the api test plugin sends, + * and that the data plane plugin processes + */ +#define foreach_vpe_api_msg \ +_(memif_create, "[id <id>] [socket <path>] [ring_size <size>] " \ + "[buffer_size <size>] [hw_addr <mac_address>] " \ + "[secret <string>] [mode ip] <master|slave>") \ +_(memif_delete, "<sw_if_index>") \ +_(memif_dump, "") + +static void +memif_vat_api_hookup (vat_main_t * vam) +{ + memif_test_main_t *mm __attribute__ ((unused)) = &memif_test_main; + /* Hook up handlers for replies from the data plane plug-in */ +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + mm->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_vpe_api_reply_msg; +#undef _ + + /* API messages we can send */ +#define _(n,h) \ + hash_set_mem (vam->function_by_name, #n, api_##n); + foreach_vpe_api_msg; +#undef _ + + /* Help strings */ +#define _(n,h) hash_set_mem (vam->help_by_name, #n, h); + foreach_vpe_api_msg; +#undef _ +} + +clib_error_t * +vat_plugin_register (vat_main_t * vam) +{ + memif_test_main_t *mm = &memif_test_main; + u8 *name; + + mm->vat_main = vam; + + /* Ask the vpp engine for the first assigned message-id */ + name = format (0, "memif_%08x%c", api_version, 0); + mm->msg_id_base = vl_client_get_first_plugin_msg_id ((char *) name); + + /* Get the control ping ID */ +#define _(id,n,crc) \ + const char *id ## _CRC __attribute__ ((unused)) = #n "_" #crc; + foreach_vl_msg_name_crc_vpe; +#undef _ + mm->ping_id = vl_api_get_msg_index ((u8 *) (VL_API_CONTROL_PING_CRC)); + + if (mm->msg_id_base != (u16) ~0) + memif_vat_api_hookup (vam); + + vec_free (name); + + return 0; +} diff --git a/src/plugins/memif/node.c b/src/plugins/memif/node.c new file mode 100644 index 00000000..4acc7149 --- /dev/null +++ b/src/plugins/memif/node.c @@ -0,0 +1,533 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *------------------------------------------------------------------
+ */
+
+#define _GNU_SOURCE
+#include <stdint.h>
+#include <net/if.h>
+#include <sys/ioctl.h>
+#include <sys/uio.h>
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/devices.h>
+#include <vnet/feature/feature.h>
+
+#include <memif/memif.h>
+#include <memif/private.h>
+
+#define foreach_memif_input_error \
+  _(NOT_IP, "not ip packet")
+
+typedef enum
+{
+#define _(f,s) MEMIF_INPUT_ERROR_##f,
+  foreach_memif_input_error
+#undef _
+    MEMIF_INPUT_N_ERROR,
+} memif_input_error_t;
+
+static char *memif_input_error_strings[] = {
+#define _(n,s) s,
+  foreach_memif_input_error
+#undef _
+};
+
+typedef struct
+{
+  u32 next_index;
+  u32 hw_if_index;
+  u16 ring;
+} memif_input_trace_t;
+
+static u8 *
+format_memif_input_trace (u8 * s, va_list * args)
+{
+  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+  memif_input_trace_t *t = va_arg (*args, memif_input_trace_t *);
+  uword indent = format_get_indent (s);
+
+  s = format (s, "memif: hw_if_index %d next-index %d",
+	      t->hw_if_index, t->next_index);
+  s = format (s, "\n%Uslot: ring %u", format_white_space, indent + 2,
+	      t->ring);
+  return s;
+}
+
+static_always_inline void
+memif_prefetch (vlib_main_t * vm, u32 bi)
+{
+  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+  vlib_prefetch_buffer_header (b, STORE);
+  CLIB_PREFETCH (b->data, CLIB_CACHE_LINE_BYTES, STORE);
+}
+
+static_always_inline void
+memif_buffer_add_to_chain (vlib_main_t * vm, u32 bi, u32 first_bi,
+			   u32 prev_bi)
+{
+  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+  vlib_buffer_t *first_b = vlib_get_buffer (vm, first_bi);
+  vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_bi);
+
+  /* update first buffer */
+  first_b->total_length_not_including_first_buffer += b->current_length;
+
+  /* update previous buffer */
+  prev_b->next_buffer = bi;
+  prev_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
+
+  /* update current buffer */
+  b->next_buffer = 0;
+}
+
+/**
+ * @brief Copy buffer from rx ring
+ *
+ * @param * vm (in)
+ * @param * mif (in) pointer to memif interface
+ * @param * ring (in) pointer to memif ring
+ * @param * mq (in) pointer to memif queue
+ * @param ring_size (in) ring size
+ * @param n_buffer_bytes (in) size of a vlib buffer's data area
+ * @param * n_free_bufs (in/out) the number of free vlib buffers available
+ * @param ** first_b (out) the first vlib buffer pointer
+ * @param * first_bi (out) the first vlib buffer index
+ * @param * bi (in/out) the current buffer index
+ * @param * num_slots (in/out) the number of descriptors available to read
+ *
+ * @return total bytes read from rx ring also written to vlib buffers
+ */
+static_always_inline uword
+memif_copy_buffer_from_rx_ring (vlib_main_t * vm, memif_if_t * mif,
+				memif_ring_t * ring, memif_queue_t * mq,
+				u16 ring_size, u32 n_buffer_bytes,
+				u32 * n_free_bufs, vlib_buffer_t ** first_b,
+				u32 * first_bi, u32 * bi, u16 * num_slots)
+{
+  memif_main_t *nm = &memif_main;
+  u32 thread_index = vlib_get_thread_index ();
+  u32 total_bytes = 0, offset = 0;
+  u32 data_len;
+  u32 bytes_to_copy;
+  void *mb;
+  vlib_buffer_t *b;
+  u16 mask = ring_size - 1;
+  u32 prev_bi;
+  u16 last_head;
+
+  while (*num_slots)
+    {
+      data_len = ring->desc[mq->last_head].length;
+      while (data_len && (*n_free_bufs))
+	{
+	  /* get empty buffer */
+	  u32 last_buf = vec_len (nm->rx_buffers[thread_index]) - 1;
+	  prev_bi = *bi;
+	  *bi = nm->rx_buffers[thread_index][last_buf];
+	  b = vlib_get_buffer (vm, *bi);
+	  _vec_len 
(nm->rx_buffers[thread_index]) = last_buf; + (*n_free_bufs)--; + if (PREDICT_FALSE (*n_free_bufs == 0)) + { + *n_free_bufs += + vlib_buffer_alloc (vm, + &nm->rx_buffers[thread_index] + [*n_free_bufs], ring_size); + _vec_len (nm->rx_buffers[thread_index]) = *n_free_bufs; + } + + if (last_buf > 4) + { + memif_prefetch (vm, nm->rx_buffers[thread_index][last_buf - 2]); + memif_prefetch (vm, nm->rx_buffers[thread_index][last_buf - 3]); + } + + /* copy buffer */ + bytes_to_copy = + data_len > n_buffer_bytes ? n_buffer_bytes : data_len; + b->current_data = 0; + mb = memif_get_buffer (mif, ring, mq->last_head); + clib_memcpy (vlib_buffer_get_current (b), mb + offset, + CLIB_CACHE_LINE_BYTES); + if (bytes_to_copy > CLIB_CACHE_LINE_BYTES) + clib_memcpy (vlib_buffer_get_current (b) + CLIB_CACHE_LINE_BYTES, + mb + CLIB_CACHE_LINE_BYTES + offset, + bytes_to_copy - CLIB_CACHE_LINE_BYTES); + + /* fill buffer header */ + b->current_length = bytes_to_copy; + + if (total_bytes == 0) + { + /* fill buffer metadata */ + b->total_length_not_including_first_buffer = 0; + b->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID; + vnet_buffer (b)->sw_if_index[VLIB_RX] = mif->sw_if_index; + vnet_buffer (b)->sw_if_index[VLIB_TX] = (u32) ~ 0; + *first_bi = *bi; + *first_b = vlib_get_buffer (vm, *first_bi); + } + else + memif_buffer_add_to_chain (vm, *bi, *first_bi, prev_bi); + + offset += bytes_to_copy; + total_bytes += bytes_to_copy; + data_len -= bytes_to_copy; + } + last_head = mq->last_head; + /* Advance to next descriptor */ + mq->last_head = (mq->last_head + 1) & mask; + offset = 0; + (*num_slots)--; + if ((ring->desc[last_head].flags & MEMIF_DESC_FLAG_NEXT) == 0) + break; + } + + return (total_bytes); +} + + +static_always_inline u32 +memif_next_from_ip_hdr (vlib_node_runtime_t * node, vlib_buffer_t * b) +{ + u8 *ptr = vlib_buffer_get_current (b); + u8 v = *ptr & 0xf0; + + if (PREDICT_TRUE (v == 0x40)) + return VNET_DEVICE_INPUT_NEXT_IP4_NCS_INPUT; + else if (PREDICT_TRUE (v == 0x60)) + return VNET_DEVICE_INPUT_NEXT_IP6_INPUT; + + b->error = node->errors[MEMIF_INPUT_ERROR_NOT_IP]; + return VNET_DEVICE_INPUT_NEXT_DROP; +} + +static_always_inline uword +memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame, memif_if_t * mif, + memif_ring_type_t type, u16 qid, + memif_interface_mode_t mode) +{ + vnet_main_t *vnm = vnet_get_main (); + memif_ring_t *ring; + memif_queue_t *mq; + u16 head; + u32 next_index; + uword n_trace = vlib_get_trace_count (vm, node); + memif_main_t *nm = &memif_main; + u32 n_rx_packets = 0; + u32 n_rx_bytes = 0; + u32 *to_next = 0; + u32 n_free_bufs; + u32 b0_total, b1_total; + u32 thread_index = vlib_get_thread_index (); + u16 ring_size, mask, num_slots; + u32 n_buffer_bytes = vlib_buffer_free_list_buffer_size (vm, + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + + mq = vec_elt_at_index (mif->rx_queues, qid); + ring = mq->ring; + ring_size = 1 << mq->log2_ring_size; + mask = ring_size - 1; + + if (mode == MEMIF_INTERFACE_MODE_IP) + { + next_index = VNET_DEVICE_INPUT_NEXT_IP6_INPUT; + } + else + { + next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; + } + + n_free_bufs = vec_len (nm->rx_buffers[thread_index]); + if (PREDICT_FALSE (n_free_bufs < ring_size)) + { + vec_validate (nm->rx_buffers[thread_index], + ring_size + n_free_bufs - 1); + n_free_bufs += + vlib_buffer_alloc (vm, &nm->rx_buffers[thread_index][n_free_bufs], + ring_size); + _vec_len (nm->rx_buffers[thread_index]) = n_free_bufs; + } + + head = ring->head; + if (head == mq->last_head) + return 0; + + if (head > 
mq->last_head) + num_slots = head - mq->last_head; + else + num_slots = ring_size - mq->last_head + head; + + while (num_slots) + { + u32 n_left_to_next; + u32 next0 = next_index; + u32 next1 = next_index; + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (num_slots > 11 && n_left_to_next > 2) + { + if (PREDICT_TRUE (mq->last_head + 5 < ring_size)) + { + CLIB_PREFETCH (memif_get_buffer (mif, ring, mq->last_head + 2), + CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (memif_get_buffer (mif, ring, mq->last_head + 3), + CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (&ring->desc[mq->last_head + 4], + CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (&ring->desc[mq->last_head + 5], + CLIB_CACHE_LINE_BYTES, LOAD); + } + else + { + CLIB_PREFETCH (memif_get_buffer + (mif, ring, (mq->last_head + 2) % mask), + CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (memif_get_buffer + (mif, ring, (mq->last_head + 3) % mask), + CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (&ring->desc[(mq->last_head + 4) % mask], + CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (&ring->desc[(mq->last_head + 5) % mask], + CLIB_CACHE_LINE_BYTES, LOAD); + } + + vlib_buffer_t *first_b0 = 0; + u32 bi0 = 0, first_bi0 = 0; + b0_total = memif_copy_buffer_from_rx_ring (vm, mif, ring, mq, + ring_size, + n_buffer_bytes, + &n_free_bufs, &first_b0, + &first_bi0, &bi0, + &num_slots); + + vlib_buffer_t *first_b1 = 0; + u32 bi1 = 0, first_bi1 = 0; + b1_total = memif_copy_buffer_from_rx_ring (vm, mif, ring, mq, + ring_size, + n_buffer_bytes, + &n_free_bufs, &first_b1, + &first_bi1, &bi1, + &num_slots); + + /* enqueue buffer */ + to_next[0] = first_bi0; + to_next[1] = first_bi1; + to_next += 2; + n_left_to_next -= 2; + + + if (mode == MEMIF_INTERFACE_MODE_IP) + { + next0 = memif_next_from_ip_hdr (node, first_b0); + next1 = memif_next_from_ip_hdr (node, first_b1); + } + else if (mode == MEMIF_INTERFACE_MODE_ETHERNET) + { + if (PREDICT_FALSE (mif->per_interface_next_index != ~0)) + next0 = next1 = mif->per_interface_next_index; + else + /* redirect if feature path + * enabled */ + vnet_feature_start_device_input_x2 (mif->sw_if_index, + &next0, &next1, + first_b0, first_b1); + } + + /* trace */ + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (first_b0); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (first_b1); + + if (PREDICT_FALSE (n_trace > 0)) + { + /* b0 */ + if (PREDICT_TRUE (first_b0 != 0)) + { + memif_input_trace_t *tr; + vlib_trace_buffer (vm, node, next0, first_b0, + /* follow_chain */ 0); + vlib_set_trace_count (vm, node, --n_trace); + tr = vlib_add_trace (vm, node, first_b0, sizeof (*tr)); + tr->next_index = next0; + tr->hw_if_index = mif->hw_if_index; + tr->ring = qid; + } + if (n_trace) + { + /* b1 */ + if (PREDICT_TRUE (first_b1 != 0)) + { + memif_input_trace_t *tr; + vlib_trace_buffer (vm, node, next1, first_b1, + /* follow_chain */ 0); + vlib_set_trace_count (vm, node, --n_trace); + tr = vlib_add_trace (vm, node, first_b1, sizeof (*tr)); + tr->next_index = next1; + tr->hw_if_index = mif->hw_if_index; + tr->ring = qid; + } + } + } + + /* enqueue */ + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, to_next, + n_left_to_next, first_bi0, + first_bi1, next0, next1); + + /* next packet */ + n_rx_packets += 2; + n_rx_bytes += b0_total + b1_total; + } + while (num_slots && n_left_to_next) + { + vlib_buffer_t *first_b0 = 0; + u32 bi0 = 0, first_bi0 = 0; + b0_total = memif_copy_buffer_from_rx_ring (vm, mif, ring, mq, + ring_size, + n_buffer_bytes, + &n_free_bufs, &first_b0, + &first_bi0, &bi0, + &num_slots); + + if (mode == 
MEMIF_INTERFACE_MODE_IP) + { + next0 = memif_next_from_ip_hdr (node, first_b0); + } + else if (mode == MEMIF_INTERFACE_MODE_ETHERNET) + { + if (PREDICT_FALSE (mif->per_interface_next_index != ~0)) + next0 = mif->per_interface_next_index; + else + /* redirect if feature path + * enabled */ + vnet_feature_start_device_input_x1 (mif->sw_if_index, + &next0, first_b0); + } + + /* trace */ + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (first_b0); + + if (PREDICT_FALSE (n_trace > 0)) + { + if (PREDICT_TRUE (first_b0 != 0)) + { + memif_input_trace_t *tr; + vlib_trace_buffer (vm, node, next0, first_b0, + /* follow_chain */ 0); + vlib_set_trace_count (vm, node, --n_trace); + tr = vlib_add_trace (vm, node, first_b0, sizeof (*tr)); + tr->next_index = next0; + tr->hw_if_index = mif->hw_if_index; + tr->ring = qid; + } + } + + /* enqueue buffer */ + to_next[0] = first_bi0; + to_next += 1; + n_left_to_next--; + + /* enqueue */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, first_bi0, next0); + + /* next packet */ + n_rx_packets++; + n_rx_bytes += b0_total; + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + CLIB_MEMORY_STORE_BARRIER (); + ring->tail = head; + + vlib_increment_combined_counter (vnm->interface_main.combined_sw_if_counters + + VNET_INTERFACE_COUNTER_RX, thread_index, + mif->hw_if_index, n_rx_packets, + n_rx_bytes); + + return n_rx_packets; +} + +static uword +memif_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 n_rx = 0; + memif_main_t *nm = &memif_main; + vnet_device_input_runtime_t *rt = (void *) node->runtime_data; + vnet_device_and_queue_t *dq; + + foreach_device_and_queue (dq, rt->devices_and_queues) + { + memif_if_t *mif; + mif = vec_elt_at_index (nm->interfaces, dq->dev_instance); + if ((mif->flags & MEMIF_IF_FLAG_ADMIN_UP) && + (mif->flags & MEMIF_IF_FLAG_CONNECTED)) + { + if (mif->flags & MEMIF_IF_FLAG_IS_SLAVE) + { + if (mif->mode == MEMIF_INTERFACE_MODE_IP) + n_rx += memif_device_input_inline (vm, node, frame, mif, + MEMIF_RING_M2S, dq->queue_id, + MEMIF_INTERFACE_MODE_IP); + else + n_rx += memif_device_input_inline (vm, node, frame, mif, + MEMIF_RING_M2S, dq->queue_id, + MEMIF_INTERFACE_MODE_ETHERNET); + } + else + { + if (mif->mode == MEMIF_INTERFACE_MODE_IP) + n_rx += memif_device_input_inline (vm, node, frame, mif, + MEMIF_RING_S2M, dq->queue_id, + MEMIF_INTERFACE_MODE_IP); + else + n_rx += memif_device_input_inline (vm, node, frame, mif, + MEMIF_RING_S2M, dq->queue_id, + MEMIF_INTERFACE_MODE_ETHERNET); + } + } + } + + return n_rx; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (memif_input_node) = { + .function = memif_input_fn, + .name = "memif-input", + .sibling_of = "device-input", + .format_trace = format_memif_input_trace, + .type = VLIB_NODE_TYPE_INPUT, + .state = VLIB_NODE_STATE_INTERRUPT, + .n_errors = MEMIF_INPUT_N_ERROR, + .error_strings = memif_input_error_strings, +}; + +VLIB_NODE_FUNCTION_MULTIARCH (memif_input_node, memif_input_fn) +/* *INDENT-ON* */ + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/memif/private.h b/src/plugins/memif/private.h new file mode 100644 index 00000000..912ec59a --- /dev/null +++ b/src/plugins/memif/private.h @@ -0,0 +1,261 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2017 Cisco and/or its affiliates. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#include <vppinfra/lock.h> + +#define MEMIF_DEFAULT_SOCKET_FILENAME "memif.sock" +#define MEMIF_DEFAULT_RING_SIZE 1024 +#define MEMIF_DEFAULT_RX_QUEUES 1 +#define MEMIF_DEFAULT_TX_QUEUES 1 +#define MEMIF_DEFAULT_BUFFER_SIZE 2048 + +#define MEMIF_MAX_M2S_RING (vec_len (vlib_mains) - 1) +#define MEMIF_MAX_S2M_RING (vec_len (vlib_mains) - 1) +#define MEMIF_MAX_REGION 255 +#define MEMIF_MAX_LOG2_RING_SIZE 14 + +#define MEMIF_DEBUG 0 + +#if MEMIF_DEBUG == 1 +#define DBG(...) clib_warning(__VA_ARGS__) +#define DBG_UNIX_LOG(...) clib_unix_warning(__VA_ARGS__) +#else +#define DBG(...) +#define DBG_UNIX_LOG(...) +#endif + +#if MEMIF_DEBUG == 1 +#define memif_file_add(a, b) do { \ + ASSERT (*a == ~0); \ + *a = clib_file_add (&file_main, b); \ + clib_warning ("clib_file_add fd %d private_data %u idx %u", \ + (b)->file_descriptor, (b)->private_data, *a); \ +} while (0) + +#define memif_file_del(a) do { \ + clib_warning ("clib_file_del idx %u",a - file_main.file_pool); \ + clib_file_del (&file_main, a); \ +} while (0) + +#define memif_file_del_by_index(a) do { \ + clib_warning ("clib_file_del idx %u", a); \ + clib_file_del_by_index (&file_main, a); \ +} while (0) +#else +#define memif_file_add(a, b) do { \ + ASSERT (*a == ~0); \ + *a = clib_file_add (&file_main, b); \ +} while (0) +#define memif_file_del(a) clib_file_del(&file_main, a) +#define memif_file_del_by_index(a) clib_file_del_by_index(&file_main, a) +#endif + +typedef struct +{ + u8 *filename; + int fd; + uword clib_file_index; + uword *pending_file_indices; + int ref_cnt; + int is_listener; + + /* hash of all registered id */ + mhash_t dev_instance_by_id; + + /* hash of all registered fds */ + uword *dev_instance_by_fd; +} memif_socket_file_t; + +typedef struct +{ + void *shm; + memif_region_size_t region_size; + int fd; +} memif_region_t; + +typedef struct +{ + memif_msg_t msg; + int fd; +} memif_msg_fifo_elt_t; + +typedef struct +{ + /* ring data */ + memif_ring_t *ring; + memif_log2_ring_size_t log2_ring_size; + memif_region_index_t region; + memif_region_offset_t offset; + + u16 last_head; + u16 last_tail; + + /* interrupts */ + int int_fd; + uword int_clib_file_index; + u64 int_count; +} memif_queue_t; + +#define foreach_memif_if_flag \ + _(0, ADMIN_UP, "admin-up") \ + _(1, IS_SLAVE, "slave") \ + _(2, CONNECTING, "connecting") \ + _(3, CONNECTED, "connected") \ + _(4, DELETING, "deleting") + +typedef enum +{ +#define _(a, b, c) MEMIF_IF_FLAG_##b = (1 << a), + foreach_memif_if_flag +#undef _ +} memif_if_flag_t; + +typedef struct +{ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + clib_spinlock_t lockp; + u32 flags; + memif_interface_id_t id; + u32 hw_if_index; + u32 sw_if_index; + uword dev_instance; + memif_interface_mode_t mode:8; + + u32 per_interface_next_index; + + /* socket connection */ + uword socket_file_index; + int conn_fd; + uword conn_clib_file_index; + memif_msg_fifo_elt_t *msg_queue; + u8 *secret; + 
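/* Editor's note, not part of the original patch: the fields below fall
 * into two groups. regions, rx_queues and tx_queues are built by the
 * slave in memif_init_regions_and_queues () and learned by the master
 * from ADD_REGION/ADD_RING messages (see socket.c). cfg holds the
 * locally requested geometry, while run holds the values actually in
 * effect: memif_msg_receive_hello () derives them by clamping cfg
 * against the peer's advertised maxima with clib_min (). */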
+ memif_region_t *regions; + + memif_queue_t *rx_queues; + memif_queue_t *tx_queues; + + /* remote info */ + pid_t remote_pid; + uid_t remote_uid; + gid_t remote_gid; + u8 *remote_name; + u8 *remote_if_name; + + struct + { + memif_log2_ring_size_t log2_ring_size; + u8 num_s2m_rings; + u8 num_m2s_rings; + u16 buffer_size; + } cfg; + + struct + { + memif_log2_ring_size_t log2_ring_size; + u8 num_s2m_rings; + u8 num_m2s_rings; + u16 buffer_size; + } run; + + /* disconnect strings */ + u8 *local_disc_string; + u8 *remote_disc_string; +} memif_if_t; + +typedef struct +{ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + + /** API message ID base */ + u16 msg_id_base; + + /* pool of all memory interfaces */ + memif_if_t *interfaces; + + /* pool of all unix socket files */ + memif_socket_file_t *socket_files; + mhash_t socket_file_index_by_filename; + + /* rx buffer cache */ + u32 **rx_buffers; + +} memif_main_t; + +extern memif_main_t memif_main; +extern vnet_device_class_t memif_device_class; +extern vlib_node_registration_t memif_input_node; + +enum +{ + MEMIF_PROCESS_EVENT_START = 1, + MEMIF_PROCESS_EVENT_STOP = 2, +} memif_process_event_t; + +typedef struct +{ + memif_interface_id_t id; + u8 *socket_filename; + u8 *secret; + u8 is_master; + memif_interface_mode_t mode:8; + memif_log2_ring_size_t log2_ring_size; + u16 buffer_size; + u8 hw_addr_set; + u8 hw_addr[6]; + u8 rx_queues; + u8 tx_queues; + + /* return */ + u32 sw_if_index; +} memif_create_if_args_t; + +int memif_create_if (vlib_main_t * vm, memif_create_if_args_t * args); +int memif_delete_if (vlib_main_t * vm, memif_if_t * mif); +clib_error_t *memif_plugin_api_hookup (vlib_main_t * vm); + +static_always_inline void * +memif_get_buffer (memif_if_t * mif, memif_ring_t * ring, u16 slot) +{ + u16 region = ring->desc[slot].region; + return mif->regions[region].shm + ring->desc[slot].offset; +} + +/* memif.c */ +clib_error_t *memif_init_regions_and_queues (memif_if_t * mif); +clib_error_t *memif_connect (memif_if_t * mif); +void memif_disconnect (memif_if_t * mif, clib_error_t * err); + +/* socket.c */ +clib_error_t *memif_conn_fd_accept_ready (clib_file_t * uf); +clib_error_t *memif_master_conn_fd_read_ready (clib_file_t * uf); +clib_error_t *memif_slave_conn_fd_read_ready (clib_file_t * uf); +clib_error_t *memif_master_conn_fd_write_ready (clib_file_t * uf); +clib_error_t *memif_slave_conn_fd_write_ready (clib_file_t * uf); +clib_error_t *memif_master_conn_fd_error (clib_file_t * uf); +clib_error_t *memif_slave_conn_fd_error (clib_file_t * uf); +clib_error_t *memif_msg_send_disconnect (memif_if_t * mif, + clib_error_t * err); +u8 *format_memif_device_name (u8 * s, va_list * args); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/memif/socket.c b/src/plugins/memif/socket.c new file mode 100644 index 00000000..1abc0f11 --- /dev/null +++ b/src/plugins/memif/socket.c @@ -0,0 +1,740 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#define _GNU_SOURCE +#include <stdint.h> +#include <net/if.h> +#include <sys/types.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <sys/uio.h> +#include <sys/mman.h> +#include <sys/prctl.h> +#include <sys/eventfd.h> +#include <inttypes.h> +#include <limits.h> + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <vnet/plugin/plugin.h> +#include <vnet/ethernet/ethernet.h> +#include <vpp/app/version.h> + +#include <memif/memif.h> +#include <memif/private.h> + +static u8 * +memif_str2vec (uint8_t * str, int len) +{ + u8 *s = 0; + int i; + + if (str[0] == 0) + return s; + + for (i = 0; i < len; i++) + { + vec_add1 (s, str[i]); + if (str[i] == 0) + return s; + } + vec_add1 (s, 0); + + return s; +} + +static clib_error_t * +memif_msg_send (int fd, memif_msg_t * msg, int afd) +{ + struct msghdr mh = { 0 }; + struct iovec iov[1]; + char ctl[CMSG_SPACE (sizeof (int))]; + int rv; + + iov[0].iov_base = (void *) msg; + iov[0].iov_len = sizeof (memif_msg_t); + mh.msg_iov = iov; + mh.msg_iovlen = 1; + + if (afd > 0) + { + struct cmsghdr *cmsg; + memset (&ctl, 0, sizeof (ctl)); + mh.msg_control = ctl; + mh.msg_controllen = sizeof (ctl); + cmsg = CMSG_FIRSTHDR (&mh); + cmsg->cmsg_len = CMSG_LEN (sizeof (int)); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy (CMSG_DATA (cmsg), &afd, sizeof (int)); + } + rv = sendmsg (fd, &mh, 0); + if (rv < 0) + return clib_error_return_unix (0, "sendmsg"); + DBG ("Message type %u sent (fd %d)", msg->type, afd); + return 0; +} + +static void +memif_msg_enq_ack (memif_if_t * mif) +{ + memif_msg_fifo_elt_t *e; + clib_fifo_add2 (mif->msg_queue, e); + + e->msg.type = MEMIF_MSG_TYPE_ACK; + e->fd = -1; +} + +static clib_error_t * +memif_msg_enq_hello (int fd) +{ + u8 *s; + memif_msg_t msg = { 0 }; + memif_msg_hello_t *h = &msg.hello; + msg.type = MEMIF_MSG_TYPE_HELLO; + h->min_version = MEMIF_VERSION; + h->max_version = MEMIF_VERSION; + h->max_m2s_ring = MEMIF_MAX_M2S_RING; + h->max_s2m_ring = MEMIF_MAX_M2S_RING; + h->max_region = MEMIF_MAX_REGION; + h->max_log2_ring_size = MEMIF_MAX_LOG2_RING_SIZE; + s = format (0, "VPP %s%c", VPP_BUILD_VER, 0); + strncpy ((char *) h->name, (char *) s, sizeof (h->name) - 1); + vec_free (s); + return memif_msg_send (fd, &msg, -1); +} + +static void +memif_msg_enq_init (memif_if_t * mif) +{ + u8 *s; + memif_msg_fifo_elt_t *e; + clib_fifo_add2 (mif->msg_queue, e); + memif_msg_init_t *i = &e->msg.init; + + e->msg.type = MEMIF_MSG_TYPE_INIT; + e->fd = -1; + i->version = MEMIF_VERSION; + i->id = mif->id; + i->mode = mif->mode; + s = format (0, "VPP %s%c", VPP_BUILD_VER, 0); + strncpy ((char *) i->name, (char *) s, sizeof (i->name) - 1); + if (mif->secret) + strncpy ((char *) i->secret, (char *) mif->secret, + sizeof (i->secret) - 1); + vec_free (s); +} + +static void +memif_msg_enq_add_region (memif_if_t * mif, u8 region) +{ + memif_msg_fifo_elt_t *e; + clib_fifo_add2 (mif->msg_queue, e); + memif_msg_add_region_t *ar = &e->msg.add_region; + + e->msg.type = 
MEMIF_MSG_TYPE_ADD_REGION; + e->fd = mif->regions[region].fd; + ar->index = region; + ar->size = mif->regions[region].region_size; +} + +static void +memif_msg_enq_add_ring (memif_if_t * mif, u8 index, u8 direction) +{ + memif_msg_fifo_elt_t *e; + clib_fifo_add2 (mif->msg_queue, e); + memif_msg_add_ring_t *ar = &e->msg.add_ring; + memif_queue_t *mq; + + ASSERT ((mif->flags & MEMIF_IF_FLAG_IS_SLAVE) != 0); + + e->msg.type = MEMIF_MSG_TYPE_ADD_RING; + + if (direction == MEMIF_RING_M2S) + mq = vec_elt_at_index (mif->rx_queues, index); + else + mq = vec_elt_at_index (mif->tx_queues, index); + + e->fd = mq->int_fd; + ar->index = index; + ar->region = mq->region; + ar->offset = mq->offset; + ar->log2_ring_size = mq->log2_ring_size; + ar->flags = (direction == MEMIF_RING_S2M) ? MEMIF_MSG_ADD_RING_FLAG_S2M : 0; +} + +static void +memif_msg_enq_connect (memif_if_t * mif) +{ + memif_msg_fifo_elt_t *e; + clib_fifo_add2 (mif->msg_queue, e); + memif_msg_connect_t *c = &e->msg.connect; + u8 *s; + + e->msg.type = MEMIF_MSG_TYPE_CONNECT; + e->fd = -1; + s = format (0, "%U%c", format_memif_device_name, mif->dev_instance, 0); + strncpy ((char *) c->if_name, (char *) s, sizeof (c->if_name) - 1); + vec_free (s); +} + +static void +memif_msg_enq_connected (memif_if_t * mif) +{ + memif_msg_fifo_elt_t *e; + clib_fifo_add2 (mif->msg_queue, e); + memif_msg_connected_t *c = &e->msg.connected; + u8 *s; + + e->msg.type = MEMIF_MSG_TYPE_CONNECTED; + e->fd = -1; + s = format (0, "%U%c", format_memif_device_name, mif->dev_instance, 0); + strncpy ((char *) c->if_name, (char *) s, sizeof (c->if_name) - 1); + vec_free (s); +} + +clib_error_t * +memif_msg_send_disconnect (memif_if_t * mif, clib_error_t * err) +{ + memif_msg_t msg = { 0 }; + msg.type = MEMIF_MSG_TYPE_DISCONNECT; + memif_msg_disconnect_t *d = &msg.disconnect; + + d->code = err->code; + strncpy ((char *) d->string, (char *) err->what, sizeof (d->string) - 1); + + return memif_msg_send (mif->conn_fd, &msg, -1); +} + +static clib_error_t * +memif_msg_receive_hello (memif_if_t * mif, memif_msg_t * msg) +{ + memif_msg_hello_t *h = &msg->hello; + + if (msg->hello.min_version > MEMIF_VERSION || + msg->hello.max_version < MEMIF_VERSION) + return clib_error_return (0, "incompatible protocol version"); + + mif->run.num_s2m_rings = clib_min (h->max_s2m_ring + 1, + mif->cfg.num_s2m_rings); + mif->run.num_m2s_rings = clib_min (h->max_m2s_ring + 1, + mif->cfg.num_m2s_rings); + mif->run.log2_ring_size = clib_min (h->max_log2_ring_size, + mif->cfg.log2_ring_size); + mif->run.buffer_size = mif->cfg.buffer_size; + + mif->remote_name = memif_str2vec (h->name, sizeof (h->name)); + + return 0; +} + +static clib_error_t * +memif_msg_receive_init (memif_if_t ** mifp, memif_msg_t * msg, + clib_file_t * uf) +{ + memif_main_t *mm = &memif_main; + memif_socket_file_t *msf = + vec_elt_at_index (mm->socket_files, uf->private_data); + memif_msg_init_t *i = &msg->init; + memif_if_t *mif, tmp; + clib_error_t *err; + uword *p; + + if (i->version != MEMIF_VERSION) + { + memif_file_del_by_index (uf - file_main.file_pool); + return clib_error_return (0, "unsupported version"); + } + + p = mhash_get (&msf->dev_instance_by_id, &i->id); + + if (!p) + { + err = clib_error_return (0, "unmatched interface id"); + goto error; + } + + mif = vec_elt_at_index (mm->interfaces, p[0]); + + if (mif->flags & MEMIF_IF_FLAG_IS_SLAVE) + { + err = clib_error_return (0, "cannot connect to slave"); + goto error; + } + + if (mif->conn_fd != -1) + { + err = clib_error_return (0, "already connected"); + goto error; + 
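Every fd these handlers exchange (region fd, interrupt eventfd) travels as SCM_RIGHTS ancillary data: memif_msg_send above packs one fd per message, and memif_msg_receive below unpacks it next to the peer credentials. A stand-alone sketch of the sending side; send_fd is a hypothetical name, not a VPP API:

#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* send a payload plus one file descriptor over a connected unix socket */
static int
send_fd (int sock, const void *payload, size_t len, int fd)
{
  struct iovec iov = { .iov_base = (void *) payload, .iov_len = len };
  char ctl[CMSG_SPACE (sizeof (int))];
  struct msghdr mh = { 0 };
  struct cmsghdr *cmsg;

  memset (ctl, 0, sizeof (ctl));
  mh.msg_iov = &iov;
  mh.msg_iovlen = 1;
  mh.msg_control = ctl;
  mh.msg_controllen = sizeof (ctl);

  cmsg = CMSG_FIRSTHDR (&mh);
  cmsg->cmsg_level = SOL_SOCKET;
  cmsg->cmsg_type = SCM_RIGHTS;
  cmsg->cmsg_len = CMSG_LEN (sizeof (int));
  memcpy (CMSG_DATA (cmsg), &fd, sizeof (int));

  return (sendmsg (sock, &mh, 0) == (ssize_t) len) ? 0 : -1;
}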
} + + if (i->mode != mif->mode) + { + err = clib_error_return (0, "mode mismatch"); + goto error; + } + + mif->conn_fd = uf->file_descriptor; + mif->conn_clib_file_index = uf - file_main.file_pool; + hash_set (msf->dev_instance_by_fd, mif->conn_fd, mif->dev_instance); + mif->remote_name = memif_str2vec (i->name, sizeof (i->name)); + *mifp = mif; + + if (mif->secret) + { + u8 *s; + int r; + s = memif_str2vec (i->secret, sizeof (i->secret)); + if (s == 0) + return clib_error_return (0, "secret required"); + + r = vec_cmp (s, mif->secret); + vec_free (s); + + if (r) + return clib_error_return (0, "incorrect secret"); + } + + return 0; + +error: + tmp.conn_fd = uf->file_descriptor; + memif_msg_send_disconnect (&tmp, err); + memif_file_del_by_index (uf - file_main.file_pool); + return err; +} + +static clib_error_t * +memif_msg_receive_add_region (memif_if_t * mif, memif_msg_t * msg, int fd) +{ + memif_msg_add_region_t *ar = &msg->add_region; + memif_region_t *mr; + if (fd < 0) + return clib_error_return (0, "missing memory region fd"); + + if (ar->index != vec_len (mif->regions)) + return clib_error_return (0, "unexpected region index"); + + if (ar->index > MEMIF_MAX_REGION) + return clib_error_return (0, "too many regions"); + + vec_validate_aligned (mif->regions, ar->index, CLIB_CACHE_LINE_BYTES); + mr = vec_elt_at_index (mif->regions, ar->index); + mr->fd = fd; + mr->region_size = ar->size; + + return 0; +} + +static clib_error_t * +memif_msg_receive_add_ring (memif_if_t * mif, memif_msg_t * msg, int fd) +{ + memif_msg_add_ring_t *ar = &msg->add_ring; + memif_queue_t *mq; + + if (fd < 0) + return clib_error_return (0, "missing ring interrupt fd"); + + if (ar->flags & MEMIF_MSG_ADD_RING_FLAG_S2M) + { + if (ar->index != vec_len (mif->rx_queues)) + return clib_error_return (0, "unexpected ring index"); + + if (ar->index > MEMIF_MAX_S2M_RING) + return clib_error_return (0, "too many rings"); + + vec_validate_aligned (mif->rx_queues, ar->index, CLIB_CACHE_LINE_BYTES); + mq = vec_elt_at_index (mif->rx_queues, ar->index); + mif->run.num_s2m_rings = vec_len (mif->rx_queues); + } + else + { + if (ar->index != vec_len (mif->tx_queues)) + return clib_error_return (0, "unexpected ring index"); + + if (ar->index > MEMIF_MAX_M2S_RING) + return clib_error_return (0, "too many rings"); + + vec_validate_aligned (mif->tx_queues, ar->index, CLIB_CACHE_LINE_BYTES); + mq = vec_elt_at_index (mif->tx_queues, ar->index); + mif->run.num_m2s_rings = vec_len (mif->tx_queues); + } + + mq->int_fd = fd; + mq->int_clib_file_index = ~0; + mq->log2_ring_size = ar->log2_ring_size; + mq->region = ar->region; + mq->offset = ar->offset; + + return 0; +} + +static clib_error_t * +memif_msg_receive_connect (memif_if_t * mif, memif_msg_t * msg) +{ + clib_error_t *err; + memif_msg_connect_t *c = &msg->connect; + + if ((err = memif_connect (mif))) + return err; + + mif->remote_if_name = memif_str2vec (c->if_name, sizeof (c->if_name)); + + return 0; +} + +static clib_error_t * +memif_msg_receive_connected (memif_if_t * mif, memif_msg_t * msg) +{ + clib_error_t *err; + memif_msg_connected_t *c = &msg->connected; + + if ((err = memif_connect (mif))) + return err; + + mif->remote_if_name = memif_str2vec (c->if_name, sizeof (c->if_name)); + return 0; +} + +static clib_error_t * +memif_msg_receive_disconnect (memif_if_t * mif, memif_msg_t * msg) +{ + memif_msg_disconnect_t *d = &msg->disconnect; + + mif->remote_disc_string = memif_str2vec (d->string, sizeof (d->string)); + return clib_error_return (0, "disconnect received"); +} + +static 
clib_error_t * +memif_msg_receive (memif_if_t ** mifp, clib_file_t * uf) +{ + char ctl[CMSG_SPACE (sizeof (int)) + + CMSG_SPACE (sizeof (struct ucred))] = { 0 }; + struct msghdr mh = { 0 }; + struct iovec iov[1]; + memif_msg_t msg = { 0 }; + ssize_t size; + clib_error_t *err = 0; + int fd = -1; + int i; + memif_if_t *mif = *mifp; + + iov[0].iov_base = (void *) &msg; + iov[0].iov_len = sizeof (memif_msg_t); + mh.msg_iov = iov; + mh.msg_iovlen = 1; + mh.msg_control = ctl; + mh.msg_controllen = sizeof (ctl); + + /* receive the incoming message */ + size = recvmsg (uf->file_descriptor, &mh, 0); + if (size != sizeof (memif_msg_t)) + { + return (size == 0) ? clib_error_return (0, "disconnected") : + clib_error_return_unix (0, + "recvmsg: malformed message received on fd %d", + uf->file_descriptor); + } + + if (mif == 0 && msg.type != MEMIF_MSG_TYPE_INIT) + { + memif_file_del (uf); + return clib_error_return (0, "unexpected message received"); + } + + /* process anciliary data */ + struct ucred *cr = 0; + struct cmsghdr *cmsg; + + cmsg = CMSG_FIRSTHDR (&mh); + while (cmsg) + { + if (cmsg->cmsg_level == SOL_SOCKET) + { + if (cmsg->cmsg_type == SCM_CREDENTIALS) + { + cr = (struct ucred *) CMSG_DATA (cmsg); + } + else if (cmsg->cmsg_type == SCM_RIGHTS) + { + int *fdp = (int *) CMSG_DATA (cmsg); + fd = *fdp; + } + } + cmsg = CMSG_NXTHDR (&mh, cmsg); + } + + DBG ("Message type %u received", msg.type); + /* process the message based on its type */ + switch (msg.type) + { + case MEMIF_MSG_TYPE_ACK: + break; + + case MEMIF_MSG_TYPE_HELLO: + if ((err = memif_msg_receive_hello (mif, &msg))) + return err; + if ((err = memif_init_regions_and_queues (mif))) + return err; + memif_msg_enq_init (mif); + memif_msg_enq_add_region (mif, 0); + vec_foreach_index (i, mif->tx_queues) + memif_msg_enq_add_ring (mif, i, MEMIF_RING_S2M); + vec_foreach_index (i, mif->rx_queues) + memif_msg_enq_add_ring (mif, i, MEMIF_RING_M2S); + memif_msg_enq_connect (mif); + break; + + case MEMIF_MSG_TYPE_INIT: + if ((err = memif_msg_receive_init (mifp, &msg, uf))) + return err; + mif = *mifp; + mif->remote_pid = cr->pid; + mif->remote_uid = cr->uid; + mif->remote_gid = cr->gid; + memif_msg_enq_ack (mif); + break; + + case MEMIF_MSG_TYPE_ADD_REGION: + if ((err = memif_msg_receive_add_region (mif, &msg, fd))) + return err; + memif_msg_enq_ack (mif); + break; + + case MEMIF_MSG_TYPE_ADD_RING: + if ((err = memif_msg_receive_add_ring (mif, &msg, fd))) + return err; + memif_msg_enq_ack (mif); + break; + + case MEMIF_MSG_TYPE_CONNECT: + if ((err = memif_msg_receive_connect (mif, &msg))) + return err; + memif_msg_enq_connected (mif); + break; + + case MEMIF_MSG_TYPE_CONNECTED: + if ((err = memif_msg_receive_connected (mif, &msg))) + return err; + break; + + case MEMIF_MSG_TYPE_DISCONNECT: + if ((err = memif_msg_receive_disconnect (mif, &msg))) + return err; + break; + + default: + err = clib_error_return (0, "unknown message type (0x%x)", msg.type); + return err; + } + + if (clib_fifo_elts (mif->msg_queue) && mif->conn_clib_file_index != ~0) + clib_file_set_data_available_to_write (&file_main, + mif->conn_clib_file_index, 1); + return 0; +} + +clib_error_t * +memif_master_conn_fd_read_ready (clib_file_t * uf) +{ + memif_main_t *mm = &memif_main; + memif_socket_file_t *msf = + pool_elt_at_index (mm->socket_files, uf->private_data); + uword *p; + memif_if_t *mif = 0; + uword conn_clib_file_index = ~0; + clib_error_t *err = 0; + + p = hash_get (msf->dev_instance_by_fd, uf->file_descriptor); + if (p) + { + mif = vec_elt_at_index (mm->interfaces, 
p[0]); + } + else + { + /* This is new connection, remove index from pending vector */ + int i; + vec_foreach_index (i, msf->pending_file_indices) + if (msf->pending_file_indices[i] == uf - file_main.file_pool) + { + conn_clib_file_index = msf->pending_file_indices[i]; + vec_del1 (msf->pending_file_indices, i); + break; + } + ASSERT (conn_clib_file_index != ~0); + } + err = memif_msg_receive (&mif, uf); + if (err) + { + memif_disconnect (mif, err); + clib_error_free (err); + } + return 0; +} + +clib_error_t * +memif_slave_conn_fd_read_ready (clib_file_t * uf) +{ + memif_main_t *mm = &memif_main; + clib_error_t *err; + memif_if_t *mif = vec_elt_at_index (mm->interfaces, uf->private_data); + err = memif_msg_receive (&mif, uf); + if (err) + { + memif_disconnect (mif, err); + clib_error_free (err); + } + return 0; +} + +static clib_error_t * +memif_conn_fd_write_ready (clib_file_t * uf, memif_if_t * mif) +{ + memif_msg_fifo_elt_t *e; + clib_fifo_sub2 (mif->msg_queue, e); + clib_file_set_data_available_to_write (&file_main, + mif->conn_clib_file_index, 0); + memif_msg_send (mif->conn_fd, &e->msg, e->fd); + return 0; +} + +clib_error_t * +memif_master_conn_fd_write_ready (clib_file_t * uf) +{ + memif_main_t *mm = &memif_main; + memif_socket_file_t *msf = + pool_elt_at_index (mm->socket_files, uf->private_data); + uword *p; + memif_if_t *mif; + + p = hash_get (msf->dev_instance_by_fd, uf->file_descriptor); + if (!p) + return 0; + + mif = vec_elt_at_index (mm->interfaces, p[0]); + return memif_conn_fd_write_ready (uf, mif); +} + +clib_error_t * +memif_slave_conn_fd_write_ready (clib_file_t * uf) +{ + memif_main_t *mm = &memif_main; + memif_if_t *mif = vec_elt_at_index (mm->interfaces, uf->private_data); + return memif_conn_fd_write_ready (uf, mif); +} + +clib_error_t * +memif_slave_conn_fd_error (clib_file_t * uf) +{ + memif_main_t *mm = &memif_main; + memif_if_t *mif = vec_elt_at_index (mm->interfaces, uf->private_data); + clib_error_t *err; + + err = clib_error_return (0, "connection fd error"); + memif_disconnect (mif, err); + clib_error_free (err); + + return 0; +} + +clib_error_t * +memif_master_conn_fd_error (clib_file_t * uf) +{ + memif_main_t *mm = &memif_main; + memif_socket_file_t *msf = + pool_elt_at_index (mm->socket_files, uf->private_data); + uword *p; + + + p = hash_get (msf->dev_instance_by_fd, uf->file_descriptor); + if (p) + { + memif_if_t *mif; + clib_error_t *err; + mif = vec_elt_at_index (mm->interfaces, p[0]); + err = clib_error_return (0, "connection fd error"); + memif_disconnect (mif, err); + clib_error_free (err); + } + else + { + int i; + vec_foreach_index (i, msf->pending_file_indices) + if (msf->pending_file_indices[i] == uf - file_main.file_pool) + { + vec_del1 (msf->pending_file_indices, i); + memif_file_del (uf); + return 0; + } + } + + clib_warning ("Error on unknown file descriptor %d", uf->file_descriptor); + memif_file_del (uf); + return 0; +} + + +clib_error_t * +memif_conn_fd_accept_ready (clib_file_t * uf) +{ + memif_main_t *mm = &memif_main; + memif_socket_file_t *msf = + pool_elt_at_index (mm->socket_files, uf->private_data); + int addr_len; + struct sockaddr_un client; + int conn_fd; + clib_file_t template = { 0 }; + uword clib_file_index = ~0; + clib_error_t *err; + + + addr_len = sizeof (client); + conn_fd = accept (uf->file_descriptor, + (struct sockaddr *) &client, (socklen_t *) & addr_len); + + if (conn_fd < 0) + return clib_error_return_unix (0, "accept fd %d", uf->file_descriptor); + + template.read_function = memif_master_conn_fd_read_ready; + 
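/* Editor's note, not part of the original patch: a connection accepted
 * here in memif_conn_fd_accept_ready () is parked in
 * msf->pending_file_indices (vec_add1 below) until the peer's INIT
 * message names a known interface id; memif_msg_receive_init () then
 * binds the fd to that interface and memif_master_conn_fd_read_ready ()
 * above drops it from the pending list, while
 * memif_master_conn_fd_error () reclaims entries that fail first. */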
template.write_function = memif_master_conn_fd_write_ready; + template.error_function = memif_master_conn_fd_error; + template.file_descriptor = conn_fd; + template.private_data = uf->private_data; + + memif_file_add (&clib_file_index, &template); + + err = memif_msg_enq_hello (conn_fd); + if (err) + { + clib_error_report (err); + memif_file_del_by_index (clib_file_index); + } + else + vec_add1 (msf->pending_file_indices, clib_file_index); + + return 0; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/nat.am b/src/plugins/nat.am new file mode 100644 index 00000000..b967a716 --- /dev/null +++ b/src/plugins/nat.am @@ -0,0 +1,41 @@ + +# Copyright (c) <current-year> <your-organization> +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +vppapitestplugins_LTLIBRARIES += nat_test_plugin.la +vppplugins_LTLIBRARIES += nat_plugin.la + +nat_plugin_la_SOURCES = nat/nat.c \ + nat/nat_api.c \ + nat/in2out.c \ + nat/out2in.c \ + nat/nat_plugin.api.h \ + nat/nat_ipfix_logging.c \ + nat/nat_det.c \ + nat/nat64.c \ + nat/nat64_cli.c \ + nat/nat64_in2out.c \ + nat/nat64_out2in.c \ + nat/nat64_db.c + +API_FILES += nat/nat.api + +nobase_apiinclude_HEADERS += \ + nat/nat_all_api_h.h \ + nat/nat_msg_enum.h \ + nat/nat.api.h + +nat_test_plugin_la_SOURCES = \ + nat/nat_test.c nat/nat_plugin.api.h + +# vi:syntax=automake diff --git a/src/plugins/nat/in2out.c b/src/plugins/nat/in2out.c new file mode 100755 index 00000000..8e583313 --- /dev/null +++ b/src/plugins/nat/in2out.c @@ -0,0 +1,3683 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vnet/handoff.h> + +#include <vnet/ip/ip.h> +#include <vnet/ethernet/ethernet.h> +#include <vnet/fib/ip4_fib.h> +#include <nat/nat.h> +#include <nat/nat_ipfix_logging.h> +#include <nat/nat_det.h> + +#include <vppinfra/hash.h> +#include <vppinfra/error.h> +#include <vppinfra/elog.h> + +typedef struct { + u32 sw_if_index; + u32 next_index; + u32 session_index; + u32 is_slow_path; +} snat_in2out_trace_t; + +typedef struct { + u32 next_worker_index; + u8 do_handoff; +} snat_in2out_worker_handoff_trace_t; + +/* packet trace format function */ +static u8 * format_snat_in2out_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + snat_in2out_trace_t * t = va_arg (*args, snat_in2out_trace_t *); + char * tag; + + tag = t->is_slow_path ? "NAT44_IN2OUT_SLOW_PATH" : "NAT44_IN2OUT_FAST_PATH"; + + s = format (s, "%s: sw_if_index %d, next index %d, session %d", tag, + t->sw_if_index, t->next_index, t->session_index); + + return s; +} + +static u8 * format_snat_in2out_fast_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + snat_in2out_trace_t * t = va_arg (*args, snat_in2out_trace_t *); + + s = format (s, "NAT44_IN2OUT_FAST: sw_if_index %d, next index %d", + t->sw_if_index, t->next_index); + + return s; +} + +static u8 * format_snat_in2out_worker_handoff_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + snat_in2out_worker_handoff_trace_t * t = + va_arg (*args, snat_in2out_worker_handoff_trace_t *); + char * m; + + m = t->do_handoff ? 
"next worker" : "same worker"; + s = format (s, "NAT44_IN2OUT_WORKER_HANDOFF: %s %d", m, t->next_worker_index); + + return s; +} + +vlib_node_registration_t snat_in2out_node; +vlib_node_registration_t snat_in2out_slowpath_node; +vlib_node_registration_t snat_in2out_fast_node; +vlib_node_registration_t snat_in2out_worker_handoff_node; +vlib_node_registration_t snat_det_in2out_node; +vlib_node_registration_t snat_in2out_output_node; +vlib_node_registration_t snat_in2out_output_slowpath_node; +vlib_node_registration_t snat_in2out_output_worker_handoff_node; +vlib_node_registration_t snat_hairpin_dst_node; +vlib_node_registration_t snat_hairpin_src_node; + + +#define foreach_snat_in2out_error \ +_(UNSUPPORTED_PROTOCOL, "Unsupported protocol") \ +_(IN2OUT_PACKETS, "Good in2out packets processed") \ +_(OUT_OF_PORTS, "Out of ports") \ +_(BAD_OUTSIDE_FIB, "Outside VRF ID not found") \ +_(BAD_ICMP_TYPE, "unsupported ICMP type") \ +_(NO_TRANSLATION, "No translation") \ +_(MAX_SESSIONS_EXCEEDED, "Maximum sessions exceeded") + +typedef enum { +#define _(sym,str) SNAT_IN2OUT_ERROR_##sym, + foreach_snat_in2out_error +#undef _ + SNAT_IN2OUT_N_ERROR, +} snat_in2out_error_t; + +static char * snat_in2out_error_strings[] = { +#define _(sym,string) string, + foreach_snat_in2out_error +#undef _ +}; + +typedef enum { + SNAT_IN2OUT_NEXT_LOOKUP, + SNAT_IN2OUT_NEXT_DROP, + SNAT_IN2OUT_NEXT_ICMP_ERROR, + SNAT_IN2OUT_NEXT_SLOW_PATH, + SNAT_IN2OUT_N_NEXT, +} snat_in2out_next_t; + +typedef enum { + SNAT_HAIRPIN_SRC_NEXT_DROP, + SNAT_HAIRPIN_SRC_NEXT_SNAT_IN2OUT, + SNAT_HAIRPIN_SRC_NEXT_SNAT_IN2OUT_WH, + SNAT_HAIRPIN_SRC_NEXT_INTERFACE_OUTPUT, + SNAT_HAIRPIN_SRC_N_NEXT, +} snat_hairpin_next_t; + +/** + * @brief Check if packet should be translated + * + * Packets aimed at outside interface and external addresss with active session + * should be translated. 
+ *
+ * @param sm NAT main
+ * @param node NAT node runtime
+ * @param sw_if_index0 index of the inside interface
+ * @param ip0 IPv4 header
+ * @param proto0 NAT protocol
+ * @param rx_fib_index0 RX FIB index
+ *
+ * @returns 0 if the packet should be translated, otherwise 1
+ */
+static inline int
+snat_not_translate_fast (snat_main_t * sm, vlib_node_runtime_t *node,
+ u32 sw_if_index0, ip4_header_t * ip0, u32 proto0,
+ u32 rx_fib_index0)
+{
+ fib_node_index_t fei = FIB_NODE_INDEX_INVALID;
+ fib_prefix_t pfx = {
+ .fp_proto = FIB_PROTOCOL_IP4,
+ .fp_len = 32,
+ .fp_addr = {
+ .ip4.as_u32 = ip0->dst_address.as_u32,
+ },
+ };
+
+ /* Don't NAT packets aimed at the interface's own address */
+ if (PREDICT_FALSE(is_interface_addr(sm, node, sw_if_index0,
+ ip0->dst_address.as_u32)))
+ return 1;
+
+ fei = fib_table_lookup (rx_fib_index0, &pfx);
+ if (FIB_NODE_INDEX_INVALID != fei)
+ {
+ u32 sw_if_index = fib_entry_get_resolving_interface (fei);
+ if (sw_if_index == ~0)
+ {
+ fei = fib_table_lookup (sm->outside_fib_index, &pfx);
+ if (FIB_NODE_INDEX_INVALID != fei)
+ sw_if_index = fib_entry_get_resolving_interface (fei);
+ }
+ snat_interface_t *i;
+ pool_foreach (i, sm->interfaces,
+ ({
+ /* NAT packet aimed at outside interface */
+ if ((i->is_inside == 0) && (sw_if_index == i->sw_if_index))
+ return 0;
+ }));
+ }
+
+ return 1;
+}
+
+static inline int
+snat_not_translate (snat_main_t * sm, vlib_node_runtime_t *node,
+ u32 sw_if_index0, ip4_header_t * ip0, u32 proto0,
+ u32 rx_fib_index0, u32 thread_index)
+{
+ udp_header_t * udp0 = ip4_next_header (ip0);
+ snat_session_key_t key0, sm0;
+ clib_bihash_kv_8_8_t kv0, value0;
+
+ key0.addr = ip0->dst_address;
+ key0.port = udp0->dst_port;
+ key0.protocol = proto0;
+ key0.fib_index = sm->outside_fib_index;
+ kv0.key = key0.as_u64;
+
+ /* NAT packets aimed at an external address that has an active session */
+ if (clib_bihash_search_8_8 (&sm->per_thread_data[thread_index].out2in, &kv0,
+ &value0))
+ {
+ /* or a static mapping */
+ if (!snat_static_mapping_match(sm, key0, &sm0, 1, 0))
+ return 0;
+ }
+ else
+ return 0;
+
+ return snat_not_translate_fast(sm, node, sw_if_index0, ip0, proto0,
+ rx_fib_index0);
+}
+
+static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0,
+ ip4_header_t * ip0,
+ u32 rx_fib_index0,
+ snat_session_key_t * key0,
+ snat_session_t ** sessionp,
+ vlib_node_runtime_t * node,
+ u32 next0,
+ u32 thread_index)
+{
+ snat_user_t *u;
+ snat_user_key_t user_key;
+ snat_session_t *s;
+ clib_bihash_kv_8_8_t kv0, value0;
+ u32 oldest_per_user_translation_list_index;
+ dlist_elt_t * oldest_per_user_translation_list_elt;
+ dlist_elt_t * per_user_translation_list_elt;
+ dlist_elt_t * per_user_list_head_elt;
+ u32 session_index;
+ snat_session_key_t key1;
+ u32 address_index = ~0;
+ u32 outside_fib_index;
+ uword * p;
+
+ if (PREDICT_FALSE (maximum_sessions_exceeded(sm, thread_index)))
+ {
+ b0->error = node->errors[SNAT_IN2OUT_ERROR_MAX_SESSIONS_EXCEEDED];
+ return SNAT_IN2OUT_NEXT_DROP;
+ }
+
+ p = hash_get (sm->ip4_main->fib_index_by_table_id, sm->outside_vrf_id);
+ if (! p)
+ {
+ b0->error = node->errors[SNAT_IN2OUT_ERROR_BAD_OUTSIDE_FIB];
+ return SNAT_IN2OUT_NEXT_DROP;
+ }
+ outside_fib_index = p[0];
+
+ key1.protocol = key0->protocol;
+ user_key.addr = ip0->src_address;
+ user_key.fib_index = rx_fib_index0;
+ kv0.key = user_key.as_u64;
+
+ /* Ever heard of the "user" = src ip4 address before? 
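(A "user" is an inside source address within a FIB; sessions are
+ grouped per user so that the quota check and LRU recycling below
+ apply per inside host.)
+ 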
*/ + if (clib_bihash_search_8_8 (&sm->per_thread_data[thread_index].user_hash, + &kv0, &value0)) + { + /* no, make a new one */ + pool_get (sm->per_thread_data[thread_index].users, u); + memset (u, 0, sizeof (*u)); + u->addr = ip0->src_address; + u->fib_index = rx_fib_index0; + + pool_get (sm->per_thread_data[thread_index].list_pool, per_user_list_head_elt); + + u->sessions_per_user_list_head_index = per_user_list_head_elt - + sm->per_thread_data[thread_index].list_pool; + + clib_dlist_init (sm->per_thread_data[thread_index].list_pool, + u->sessions_per_user_list_head_index); + + kv0.value = u - sm->per_thread_data[thread_index].users; + + /* add user */ + clib_bihash_add_del_8_8 (&sm->per_thread_data[thread_index].user_hash, + &kv0, 1 /* is_add */); + } + else + { + u = pool_elt_at_index (sm->per_thread_data[thread_index].users, + value0.value); + } + + /* Over quota? Recycle the least recently used dynamic translation */ + if (u->nsessions >= sm->max_translations_per_user) + { + /* Remove the oldest dynamic translation */ + do { + oldest_per_user_translation_list_index = + clib_dlist_remove_head (sm->per_thread_data[thread_index].list_pool, + u->sessions_per_user_list_head_index); + + ASSERT (oldest_per_user_translation_list_index != ~0); + + /* add it back to the end of the LRU list */ + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, + u->sessions_per_user_list_head_index, + oldest_per_user_translation_list_index); + /* Get the list element */ + oldest_per_user_translation_list_elt = + pool_elt_at_index (sm->per_thread_data[thread_index].list_pool, + oldest_per_user_translation_list_index); + + /* Get the session index from the list element */ + session_index = oldest_per_user_translation_list_elt->value; + + /* Get the session */ + s = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, + session_index); + } while (snat_is_session_static (s)); + + if (snat_is_unk_proto_session (s)) + { + clib_bihash_kv_16_8_t up_kv; + nat_ed_ses_key_t key; + + /* Remove from lookup tables */ + key.l_addr = s->in2out.addr; + key.r_addr = s->ext_host_addr; + key.fib_index = s->in2out.fib_index; + key.proto = s->in2out.port; + key.rsvd = 0; + key.l_port = 0; + up_kv.key[0] = key.as_u64[0]; + up_kv.key[1] = key.as_u64[1]; + if (clib_bihash_add_del_16_8 (&sm->in2out_ed, &up_kv, 0)) + clib_warning ("in2out key del failed"); + + key.l_addr = s->out2in.addr; + key.fib_index = s->out2in.fib_index; + up_kv.key[0] = key.as_u64[0]; + up_kv.key[1] = key.as_u64[1]; + if (clib_bihash_add_del_16_8 (&sm->out2in_ed, &up_kv, 0)) + clib_warning ("out2in key del failed"); + } + else + { + /* Remove in2out, out2in keys */ + kv0.key = s->in2out.as_u64; + if (clib_bihash_add_del_8_8 (&sm->per_thread_data[thread_index].in2out, + &kv0, 0 /* is_add */)) + clib_warning ("in2out key delete failed"); + kv0.key = s->out2in.as_u64; + if (clib_bihash_add_del_8_8 (&sm->per_thread_data[thread_index].out2in, + &kv0, 0 /* is_add */)) + clib_warning ("out2in key delete failed"); + + /* log NAT event */ + snat_ipfix_logging_nat44_ses_delete(s->in2out.addr.as_u32, + s->out2in.addr.as_u32, + s->in2out.protocol, + s->in2out.port, + s->out2in.port, + s->in2out.fib_index); + + snat_free_outside_address_and_port + (sm, thread_index, &s->out2in, s->outside_address_index); + } + s->outside_address_index = ~0; + + if (snat_alloc_outside_address_and_port (sm, rx_fib_index0, thread_index, + &key1, &address_index)) + { + ASSERT(0); + + b0->error = node->errors[SNAT_IN2OUT_ERROR_OUT_OF_PORTS]; + return SNAT_IN2OUT_NEXT_DROP; 
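+ /* Not expected to be reached: the recycle step above just freed an
+ outside address/port, so allocation should succeed; the ASSERT
+ documents that invariant in debug images. */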
+ } + s->outside_address_index = address_index; + } + else + { + u8 static_mapping = 1; + + /* First try to match static mapping by local address and port */ + if (snat_static_mapping_match (sm, *key0, &key1, 0, 0)) + { + static_mapping = 0; + /* Try to create dynamic translation */ + if (snat_alloc_outside_address_and_port (sm, rx_fib_index0, + thread_index, &key1, + &address_index)) + { + b0->error = node->errors[SNAT_IN2OUT_ERROR_OUT_OF_PORTS]; + return SNAT_IN2OUT_NEXT_DROP; + } + } + + /* Create a new session */ + pool_get (sm->per_thread_data[thread_index].sessions, s); + memset (s, 0, sizeof (*s)); + + s->outside_address_index = address_index; + + if (static_mapping) + { + u->nstaticsessions++; + s->flags |= SNAT_SESSION_FLAG_STATIC_MAPPING; + } + else + { + u->nsessions++; + } + + /* Create list elts */ + pool_get (sm->per_thread_data[thread_index].list_pool, + per_user_translation_list_elt); + clib_dlist_init (sm->per_thread_data[thread_index].list_pool, + per_user_translation_list_elt - + sm->per_thread_data[thread_index].list_pool); + + per_user_translation_list_elt->value = + s - sm->per_thread_data[thread_index].sessions; + s->per_user_index = per_user_translation_list_elt - + sm->per_thread_data[thread_index].list_pool; + s->per_user_list_head_index = u->sessions_per_user_list_head_index; + + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, + s->per_user_list_head_index, + per_user_translation_list_elt - + sm->per_thread_data[thread_index].list_pool); + } + + s->in2out = *key0; + s->out2in = key1; + s->out2in.protocol = key0->protocol; + s->out2in.fib_index = outside_fib_index; + s->ext_host_addr.as_u32 = ip0->dst_address.as_u32; + *sessionp = s; + + /* Add to translation hashes */ + kv0.key = s->in2out.as_u64; + kv0.value = s - sm->per_thread_data[thread_index].sessions; + if (clib_bihash_add_del_8_8 (&sm->per_thread_data[thread_index].in2out, &kv0, + 1 /* is_add */)) + clib_warning ("in2out key add failed"); + + kv0.key = s->out2in.as_u64; + kv0.value = s - sm->per_thread_data[thread_index].sessions; + + if (clib_bihash_add_del_8_8 (&sm->per_thread_data[thread_index].out2in, &kv0, + 1 /* is_add */)) + clib_warning ("out2in key add failed"); + + /* log NAT event */ + snat_ipfix_logging_nat44_ses_create(s->in2out.addr.as_u32, + s->out2in.addr.as_u32, + s->in2out.protocol, + s->in2out.port, + s->out2in.port, + s->in2out.fib_index); + return next0; +} + +static_always_inline +snat_in2out_error_t icmp_get_key(ip4_header_t *ip0, + snat_session_key_t *p_key0) +{ + icmp46_header_t *icmp0; + snat_session_key_t key0; + icmp_echo_header_t *echo0, *inner_echo0 = 0; + ip4_header_t *inner_ip0 = 0; + void *l4_header = 0; + icmp46_header_t *inner_icmp0; + + icmp0 = (icmp46_header_t *) ip4_next_header (ip0); + echo0 = (icmp_echo_header_t *)(icmp0+1); + + if (!icmp_is_error_message (icmp0)) + { + key0.protocol = SNAT_PROTOCOL_ICMP; + key0.addr = ip0->src_address; + key0.port = echo0->identifier; + } + else + { + inner_ip0 = (ip4_header_t *)(echo0+1); + l4_header = ip4_next_header (inner_ip0); + key0.protocol = ip_proto_to_snat_proto (inner_ip0->protocol); + key0.addr = inner_ip0->dst_address; + switch (key0.protocol) + { + case SNAT_PROTOCOL_ICMP: + inner_icmp0 = (icmp46_header_t*)l4_header; + inner_echo0 = (icmp_echo_header_t *)(inner_icmp0+1); + key0.port = inner_echo0->identifier; + break; + case SNAT_PROTOCOL_UDP: + case SNAT_PROTOCOL_TCP: + key0.port = ((tcp_udp_header_t*)l4_header)->dst_port; + break; + default: + return SNAT_IN2OUT_ERROR_UNSUPPORTED_PROTOCOL; + } + } + 
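/* For echo messages the ICMP identifier stands in for the L4 port.
+ For error messages the session key is taken from the embedded
+ (offending) packet's destination fields: that packet was addressed
+ to the inside host, so they match the in2out session key. */
+ 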
*p_key0 = key0; + return -1; /* success */ +} + +/** + * Get address and port values to be used for ICMP packet translation + * and create session if needed + * + * @param[in,out] sm NAT main + * @param[in,out] node NAT node runtime + * @param[in] thread_index thread index + * @param[in,out] b0 buffer containing packet to be translated + * @param[out] p_proto protocol used for matching + * @param[out] p_value address and port after NAT translation + * @param[out] p_dont_translate if packet should not be translated + * @param d optional parameter + * @param e optional parameter + */ +u32 icmp_match_in2out_slow(snat_main_t *sm, vlib_node_runtime_t *node, + u32 thread_index, vlib_buffer_t *b0, + ip4_header_t *ip0, u8 *p_proto, + snat_session_key_t *p_value, + u8 *p_dont_translate, void *d, void *e) +{ + icmp46_header_t *icmp0; + u32 sw_if_index0; + u32 rx_fib_index0; + snat_session_key_t key0; + snat_session_t *s0 = 0; + u8 dont_translate = 0; + clib_bihash_kv_8_8_t kv0, value0; + u32 next0 = ~0; + int err; + + icmp0 = (icmp46_header_t *) ip4_next_header (ip0); + sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX]; + rx_fib_index0 = ip4_fib_table_get_index_for_sw_if_index (sw_if_index0); + + err = icmp_get_key (ip0, &key0); + if (err != -1) + { + b0->error = node->errors[err]; + next0 = SNAT_IN2OUT_NEXT_DROP; + goto out; + } + key0.fib_index = rx_fib_index0; + + kv0.key = key0.as_u64; + + if (clib_bihash_search_8_8 (&sm->per_thread_data[thread_index].in2out, &kv0, + &value0)) + { + if (PREDICT_FALSE(snat_not_translate(sm, node, sw_if_index0, ip0, + IP_PROTOCOL_ICMP, rx_fib_index0, thread_index) && + vnet_buffer(b0)->sw_if_index[VLIB_TX] == ~0)) + { + dont_translate = 1; + goto out; + } + + if (PREDICT_FALSE(icmp_is_error_message (icmp0))) + { + b0->error = node->errors[SNAT_IN2OUT_ERROR_BAD_ICMP_TYPE]; + next0 = SNAT_IN2OUT_NEXT_DROP; + goto out; + } + + next0 = slow_path (sm, b0, ip0, rx_fib_index0, &key0, + &s0, node, next0, thread_index); + + if (PREDICT_FALSE (next0 == SNAT_IN2OUT_NEXT_DROP)) + goto out; + } + else + { + if (PREDICT_FALSE(icmp0->type != ICMP4_echo_request && + icmp0->type != ICMP4_echo_reply && + !icmp_is_error_message (icmp0))) + { + b0->error = node->errors[SNAT_IN2OUT_ERROR_BAD_ICMP_TYPE]; + next0 = SNAT_IN2OUT_NEXT_DROP; + goto out; + } + + s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, + value0.value); + } + +out: + *p_proto = key0.protocol; + if (s0) + *p_value = s0->out2in; + *p_dont_translate = dont_translate; + if (d) + *(snat_session_t**)d = s0; + return next0; +} + +/** + * Get address and port values to be used for ICMP packet translation + * + * @param[in] sm NAT main + * @param[in,out] node NAT node runtime + * @param[in] thread_index thread index + * @param[in,out] b0 buffer containing packet to be translated + * @param[out] p_proto protocol used for matching + * @param[out] p_value address and port after NAT translation + * @param[out] p_dont_translate if packet should not be translated + * @param d optional parameter + * @param e optional parameter + */ +u32 icmp_match_in2out_fast(snat_main_t *sm, vlib_node_runtime_t *node, + u32 thread_index, vlib_buffer_t *b0, + ip4_header_t *ip0, u8 *p_proto, + snat_session_key_t *p_value, + u8 *p_dont_translate, void *d, void *e) +{ + icmp46_header_t *icmp0; + u32 sw_if_index0; + u32 rx_fib_index0; + snat_session_key_t key0; + snat_session_key_t sm0; + u8 dont_translate = 0; + u8 is_addr_only; + u32 next0 = ~0; + int err; + + icmp0 = (icmp46_header_t *) ip4_next_header (ip0); + sw_if_index0 = 
vnet_buffer(b0)->sw_if_index[VLIB_RX]; + rx_fib_index0 = ip4_fib_table_get_index_for_sw_if_index (sw_if_index0); + + err = icmp_get_key (ip0, &key0); + if (err != -1) + { + b0->error = node->errors[err]; + next0 = SNAT_IN2OUT_NEXT_DROP; + goto out2; + } + key0.fib_index = rx_fib_index0; + + if (snat_static_mapping_match(sm, key0, &sm0, 0, &is_addr_only)) + { + if (PREDICT_FALSE(snat_not_translate_fast(sm, node, sw_if_index0, ip0, + IP_PROTOCOL_ICMP, rx_fib_index0))) + { + dont_translate = 1; + goto out; + } + + if (icmp_is_error_message (icmp0)) + { + next0 = SNAT_IN2OUT_NEXT_DROP; + goto out; + } + + b0->error = node->errors[SNAT_IN2OUT_ERROR_NO_TRANSLATION]; + next0 = SNAT_IN2OUT_NEXT_DROP; + goto out; + } + + if (PREDICT_FALSE(icmp0->type != ICMP4_echo_request && + (icmp0->type != ICMP4_echo_reply || !is_addr_only) && + !icmp_is_error_message (icmp0))) + { + b0->error = node->errors[SNAT_IN2OUT_ERROR_BAD_ICMP_TYPE]; + next0 = SNAT_IN2OUT_NEXT_DROP; + goto out; + } + +out: + *p_value = sm0; +out2: + *p_proto = key0.protocol; + *p_dont_translate = dont_translate; + return next0; +} + +static inline u32 icmp_in2out (snat_main_t *sm, + vlib_buffer_t * b0, + ip4_header_t * ip0, + icmp46_header_t * icmp0, + u32 sw_if_index0, + u32 rx_fib_index0, + vlib_node_runtime_t * node, + u32 next0, + u32 thread_index, + void *d, + void *e) +{ + snat_session_key_t sm0; + u8 protocol; + icmp_echo_header_t *echo0, *inner_echo0 = 0; + ip4_header_t *inner_ip0; + void *l4_header = 0; + icmp46_header_t *inner_icmp0; + u8 dont_translate; + u32 new_addr0, old_addr0; + u16 old_id0, new_id0; + ip_csum_t sum0; + u16 checksum0; + u32 next0_tmp; + + echo0 = (icmp_echo_header_t *)(icmp0+1); + + next0_tmp = sm->icmp_match_in2out_cb(sm, node, thread_index, b0, ip0, + &protocol, &sm0, &dont_translate, d, e); + if (next0_tmp != ~0) + next0 = next0_tmp; + if (next0 == SNAT_IN2OUT_NEXT_DROP || dont_translate) + goto out; + + sum0 = ip_incremental_checksum (0, icmp0, + ntohs(ip0->length) - ip4_header_bytes (ip0)); + checksum0 = ~ip_csum_fold (sum0); + if (PREDICT_FALSE(checksum0 != 0 && checksum0 != 0xffff)) + { + next0 = SNAT_IN2OUT_NEXT_DROP; + goto out; + } + + old_addr0 = ip0->src_address.as_u32; + new_addr0 = ip0->src_address.as_u32 = sm0.addr.as_u32; + if (vnet_buffer(b0)->sw_if_index[VLIB_TX] == ~0) + vnet_buffer(b0)->sw_if_index[VLIB_TX] = sm0.fib_index; + + sum0 = ip0->checksum; + sum0 = ip_csum_update (sum0, old_addr0, new_addr0, ip4_header_t, + src_address /* changed member */); + ip0->checksum = ip_csum_fold (sum0); + + if (!icmp_is_error_message (icmp0)) + { + new_id0 = sm0.port; + if (PREDICT_FALSE(new_id0 != echo0->identifier)) + { + old_id0 = echo0->identifier; + new_id0 = sm0.port; + echo0->identifier = new_id0; + + sum0 = icmp0->checksum; + sum0 = ip_csum_update (sum0, old_id0, new_id0, icmp_echo_header_t, + identifier); + icmp0->checksum = ip_csum_fold (sum0); + } + } + else + { + inner_ip0 = (ip4_header_t *)(echo0+1); + l4_header = ip4_next_header (inner_ip0); + + if (!ip4_header_checksum_is_valid (inner_ip0)) + { + next0 = SNAT_IN2OUT_NEXT_DROP; + goto out; + } + + old_addr0 = inner_ip0->dst_address.as_u32; + inner_ip0->dst_address = sm0.addr; + new_addr0 = inner_ip0->dst_address.as_u32; + + sum0 = icmp0->checksum; + sum0 = ip_csum_update (sum0, old_addr0, new_addr0, ip4_header_t, + dst_address /* changed member */); + icmp0->checksum = ip_csum_fold (sum0); + + switch (protocol) + { + case SNAT_PROTOCOL_ICMP: + inner_icmp0 = (icmp46_header_t*)l4_header; + inner_echo0 = (icmp_echo_header_t 
*)(inner_icmp0+1); + + old_id0 = inner_echo0->identifier; + new_id0 = sm0.port; + inner_echo0->identifier = new_id0; + + sum0 = icmp0->checksum; + sum0 = ip_csum_update (sum0, old_id0, new_id0, icmp_echo_header_t, + identifier); + icmp0->checksum = ip_csum_fold (sum0); + break; + case SNAT_PROTOCOL_UDP: + case SNAT_PROTOCOL_TCP: + old_id0 = ((tcp_udp_header_t*)l4_header)->dst_port; + new_id0 = sm0.port; + ((tcp_udp_header_t*)l4_header)->dst_port = new_id0; + + sum0 = icmp0->checksum; + sum0 = ip_csum_update (sum0, old_id0, new_id0, tcp_udp_header_t, + dst_port); + icmp0->checksum = ip_csum_fold (sum0); + break; + default: + ASSERT(0); + } + } + +out: + return next0; +} + +/** + * @brief Hairpinning + * + * Hairpinning allows two endpoints on the internal side of the NAT to + * communicate even if they only use each other's external IP addresses + * and ports. + * + * @param sm NAT main. + * @param b0 Vlib buffer. + * @param ip0 IP header. + * @param udp0 UDP header. + * @param tcp0 TCP header. + * @param proto0 NAT protocol. + */ +static inline void +snat_hairpinning (snat_main_t *sm, + vlib_buffer_t * b0, + ip4_header_t * ip0, + udp_header_t * udp0, + tcp_header_t * tcp0, + u32 proto0) +{ + snat_session_key_t key0, sm0; + snat_session_t * s0; + clib_bihash_kv_8_8_t kv0, value0; + ip_csum_t sum0; + u32 new_dst_addr0 = 0, old_dst_addr0, ti = 0, si; + u16 new_dst_port0, old_dst_port0; + + key0.addr = ip0->dst_address; + key0.port = udp0->dst_port; + key0.protocol = proto0; + key0.fib_index = sm->outside_fib_index; + kv0.key = key0.as_u64; + + /* Check if destination is static mappings */ + if (!snat_static_mapping_match(sm, key0, &sm0, 1, 0)) + { + new_dst_addr0 = sm0.addr.as_u32; + new_dst_port0 = sm0.port; + vnet_buffer(b0)->sw_if_index[VLIB_TX] = sm0.fib_index; + } + /* or active session */ + else + { + if (sm->num_workers > 1) + ti = (clib_net_to_host_u16 (udp0->dst_port) - 1024) / sm->port_per_thread; + else + ti = sm->num_workers; + + if (!clib_bihash_search_8_8 (&sm->per_thread_data[ti].out2in, &kv0, &value0)) + { + si = value0.value; + + s0 = pool_elt_at_index (sm->per_thread_data[ti].sessions, si); + new_dst_addr0 = s0->in2out.addr.as_u32; + new_dst_port0 = s0->in2out.port; + vnet_buffer(b0)->sw_if_index[VLIB_TX] = s0->in2out.fib_index; + } + } + + /* Destination is behind the same NAT, use internal address and port */ + if (new_dst_addr0) + { + old_dst_addr0 = ip0->dst_address.as_u32; + ip0->dst_address.as_u32 = new_dst_addr0; + sum0 = ip0->checksum; + sum0 = ip_csum_update (sum0, old_dst_addr0, new_dst_addr0, + ip4_header_t, dst_address); + ip0->checksum = ip_csum_fold (sum0); + + old_dst_port0 = tcp0->dst; + if (PREDICT_TRUE(new_dst_port0 != old_dst_port0)) + { + if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) + { + tcp0->dst = new_dst_port0; + sum0 = tcp0->checksum; + sum0 = ip_csum_update (sum0, old_dst_addr0, new_dst_addr0, + ip4_header_t, dst_address); + sum0 = ip_csum_update (sum0, old_dst_port0, new_dst_port0, + ip4_header_t /* cheat */, length); + tcp0->checksum = ip_csum_fold(sum0); + } + else + { + udp0->dst_port = new_dst_port0; + udp0->checksum = 0; + } + } + else + { + if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) + { + sum0 = tcp0->checksum; + sum0 = ip_csum_update (sum0, old_dst_addr0, new_dst_addr0, + ip4_header_t, dst_address); + tcp0->checksum = ip_csum_fold(sum0); + } + } + } +} + +static inline void +snat_icmp_hairpinning (snat_main_t *sm, + vlib_buffer_t * b0, + ip4_header_t * ip0, + icmp46_header_t * icmp0) +{ + snat_session_key_t key0, sm0; + 
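/* As in snat_hairpinning() above, but keyed on the ICMP echo
+ identifier in place of a TCP/UDP port; with multiple workers the
+ owning thread is recovered from the translated identifier as
+ (id - 1024) / port_per_thread. */
+ 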
clib_bihash_kv_8_8_t kv0, value0; + u32 new_dst_addr0 = 0, old_dst_addr0, si, ti = 0; + ip_csum_t sum0; + snat_session_t *s0; + + if (!icmp_is_error_message (icmp0)) + { + icmp_echo_header_t *echo0 = (icmp_echo_header_t *)(icmp0+1); + u16 icmp_id0 = echo0->identifier; + key0.addr = ip0->dst_address; + key0.port = icmp_id0; + key0.protocol = SNAT_PROTOCOL_ICMP; + key0.fib_index = sm->outside_fib_index; + kv0.key = key0.as_u64; + + if (sm->num_workers > 1) + ti = (clib_net_to_host_u16 (icmp_id0) - 1024) / sm->port_per_thread; + else + ti = sm->num_workers; + + /* Check if destination is in active sessions */ + if (clib_bihash_search_8_8 (&sm->per_thread_data[ti].out2in, &kv0, + &value0)) + { + /* or static mappings */ + if (!snat_static_mapping_match(sm, key0, &sm0, 1, 0)) + { + new_dst_addr0 = sm0.addr.as_u32; + vnet_buffer(b0)->sw_if_index[VLIB_TX] = sm0.fib_index; + } + } + else + { + si = value0.value; + + s0 = pool_elt_at_index (sm->per_thread_data[ti].sessions, si); + new_dst_addr0 = s0->in2out.addr.as_u32; + vnet_buffer(b0)->sw_if_index[VLIB_TX] = s0->in2out.fib_index; + echo0->identifier = s0->in2out.port; + sum0 = icmp0->checksum; + sum0 = ip_csum_update (sum0, icmp_id0, s0->in2out.port, + icmp_echo_header_t, identifier); + icmp0->checksum = ip_csum_fold (sum0); + } + + /* Destination is behind the same NAT, use internal address and port */ + if (new_dst_addr0) + { + old_dst_addr0 = ip0->dst_address.as_u32; + ip0->dst_address.as_u32 = new_dst_addr0; + sum0 = ip0->checksum; + sum0 = ip_csum_update (sum0, old_dst_addr0, new_dst_addr0, + ip4_header_t, dst_address); + ip0->checksum = ip_csum_fold (sum0); + } + } + +} + +static inline u32 icmp_in2out_slow_path (snat_main_t *sm, + vlib_buffer_t * b0, + ip4_header_t * ip0, + icmp46_header_t * icmp0, + u32 sw_if_index0, + u32 rx_fib_index0, + vlib_node_runtime_t * node, + u32 next0, + f64 now, + u32 thread_index, + snat_session_t ** p_s0) +{ + next0 = icmp_in2out(sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, + next0, thread_index, p_s0, 0); + snat_session_t * s0 = *p_s0; + if (PREDICT_TRUE(next0 != SNAT_IN2OUT_NEXT_DROP && s0)) + { + /* Hairpinning */ + if (vnet_buffer(b0)->sw_if_index[VLIB_TX] == 0) + snat_icmp_hairpinning(sm, b0, ip0, icmp0); + /* Accounting */ + s0->last_heard = now; + s0->total_pkts++; + s0->total_bytes += vlib_buffer_length_in_chain (sm->vlib_main, b0); + /* Per-user LRU list maintenance for dynamic translations */ + if (!snat_is_session_static (s0)) + { + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, + s0->per_user_index); + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, + s0->per_user_list_head_index, + s0->per_user_index); + } + } + return next0; +} +static inline void +snat_hairpinning_unknown_proto (snat_main_t *sm, + vlib_buffer_t * b, + ip4_header_t * ip) +{ + u32 old_addr, new_addr = 0, ti = 0; + clib_bihash_kv_8_8_t kv, value; + clib_bihash_kv_16_8_t s_kv, s_value; + nat_ed_ses_key_t key; + snat_session_key_t m_key; + snat_static_mapping_t *m; + ip_csum_t sum; + snat_session_t *s; + + old_addr = ip->dst_address.as_u32; + key.l_addr.as_u32 = ip->dst_address.as_u32; + key.r_addr.as_u32 = ip->src_address.as_u32; + key.fib_index = sm->outside_fib_index; + key.proto = ip->protocol; + key.rsvd = 0; + key.l_port = 0; + s_kv.key[0] = key.as_u64[0]; + s_kv.key[1] = key.as_u64[1]; + if (clib_bihash_search_16_8 (&sm->out2in_ed, &s_kv, &s_value)) + { + m_key.addr = ip->dst_address; + m_key.fib_index = sm->outside_fib_index; + m_key.port = 0; + m_key.protocol = 0; + kv.key = 
m_key.as_u64; + if (clib_bihash_search_8_8 (&sm->static_mapping_by_external, &kv, &value)) + return; + + m = pool_elt_at_index (sm->static_mappings, value.value); + if (vnet_buffer(b)->sw_if_index[VLIB_TX] == ~0) + vnet_buffer(b)->sw_if_index[VLIB_TX] = m->fib_index; + new_addr = ip->dst_address.as_u32 = m->local_addr.as_u32; + } + else + { + if (sm->num_workers > 1) + ti = sm->worker_out2in_cb (ip, sm->outside_fib_index); + else + ti = sm->num_workers; + + s = pool_elt_at_index (sm->per_thread_data[ti].sessions, s_value.value); + if (vnet_buffer(b)->sw_if_index[VLIB_TX] == ~0) + vnet_buffer(b)->sw_if_index[VLIB_TX] = s->in2out.fib_index; + new_addr = ip->dst_address.as_u32 = s->in2out.addr.as_u32; + } + sum = ip->checksum; + sum = ip_csum_update (sum, old_addr, new_addr, ip4_header_t, dst_address); + ip->checksum = ip_csum_fold (sum); +} + +static snat_session_t * +snat_in2out_unknown_proto (snat_main_t *sm, + vlib_buffer_t * b, + ip4_header_t * ip, + u32 rx_fib_index, + u32 thread_index, + f64 now, + vlib_main_t * vm, + vlib_node_runtime_t * node) +{ + clib_bihash_kv_8_8_t kv, value; + clib_bihash_kv_16_8_t s_kv, s_value; + snat_static_mapping_t *m; + snat_session_key_t m_key; + u32 old_addr, new_addr = 0; + ip_csum_t sum; + snat_user_key_t u_key; + snat_user_t *u; + dlist_elt_t *head, *elt, *oldest; + snat_main_per_thread_data_t *tsm = &sm->per_thread_data[thread_index]; + u32 elt_index, head_index, ses_index, oldest_index; + snat_session_t * s; + nat_ed_ses_key_t key; + u32 address_index = ~0; + int i; + u8 is_sm = 0; + + old_addr = ip->src_address.as_u32; + + key.l_addr = ip->src_address; + key.r_addr = ip->dst_address; + key.fib_index = rx_fib_index; + key.proto = ip->protocol; + key.rsvd = 0; + key.l_port = 0; + s_kv.key[0] = key.as_u64[0]; + s_kv.key[1] = key.as_u64[1]; + + if (!clib_bihash_search_16_8 (&sm->in2out_ed, &s_kv, &s_value)) + { + s = pool_elt_at_index (tsm->sessions, s_value.value); + new_addr = ip->src_address.as_u32 = s->out2in.addr.as_u32; + } + else + { + if (PREDICT_FALSE (maximum_sessions_exceeded(sm, thread_index))) + { + b->error = node->errors[SNAT_IN2OUT_ERROR_MAX_SESSIONS_EXCEEDED]; + return 0; + } + + u_key.addr = ip->src_address; + u_key.fib_index = rx_fib_index; + kv.key = u_key.as_u64; + + /* Ever heard of the "user" = src ip4 address before? 
*/ + if (clib_bihash_search_8_8 (&tsm->user_hash, &kv, &value)) + { + /* no, make a new one */ + pool_get (tsm->users, u); + memset (u, 0, sizeof (*u)); + u->addr = ip->src_address; + u->fib_index = rx_fib_index; + + pool_get (tsm->list_pool, head); + u->sessions_per_user_list_head_index = head - tsm->list_pool; + + clib_dlist_init (tsm->list_pool, + u->sessions_per_user_list_head_index); + + kv.value = u - tsm->users; + + /* add user */ + clib_bihash_add_del_8_8 (&tsm->user_hash, &kv, 1); + } + else + { + u = pool_elt_at_index (tsm->users, value.value); + } + + m_key.addr = ip->src_address; + m_key.port = 0; + m_key.protocol = 0; + m_key.fib_index = rx_fib_index; + kv.key = m_key.as_u64; + + /* Try to find static mapping first */ + if (!clib_bihash_search_8_8 (&sm->static_mapping_by_local, &kv, &value)) + { + m = pool_elt_at_index (sm->static_mappings, value.value); + new_addr = ip->src_address.as_u32 = m->external_addr.as_u32; + is_sm = 1; + goto create_ses; + } + /* Fallback to 3-tuple key */ + else + { + /* Choose same out address as for TCP/UDP session to same destination */ + if (!clib_bihash_search_8_8 (&tsm->user_hash, &kv, &value)) + { + head_index = u->sessions_per_user_list_head_index; + head = pool_elt_at_index (tsm->list_pool, head_index); + elt_index = head->next; + elt = pool_elt_at_index (tsm->list_pool, elt_index); + ses_index = elt->value; + while (ses_index != ~0) + { + s = pool_elt_at_index (tsm->sessions, ses_index); + elt_index = elt->next; + elt = pool_elt_at_index (tsm->list_pool, elt_index); + ses_index = elt->value; + + if (s->ext_host_addr.as_u32 == ip->dst_address.as_u32) + { + new_addr = ip->src_address.as_u32 = s->out2in.addr.as_u32; + address_index = s->outside_address_index; + + key.fib_index = sm->outside_fib_index; + key.l_addr.as_u32 = new_addr; + s_kv.key[0] = key.as_u64[0]; + s_kv.key[1] = key.as_u64[1]; + if (clib_bihash_search_16_8 (&sm->out2in_ed, &s_kv, &s_value)) + break; + + goto create_ses; + } + } + } + key.fib_index = sm->outside_fib_index; + for (i = 0; i < vec_len (sm->addresses); i++) + { + key.l_addr.as_u32 = sm->addresses[i].addr.as_u32; + s_kv.key[0] = key.as_u64[0]; + s_kv.key[1] = key.as_u64[1]; + if (clib_bihash_search_16_8 (&sm->out2in_ed, &s_kv, &s_value)) + { + new_addr = ip->src_address.as_u32 = key.l_addr.as_u32; + address_index = i; + goto create_ses; + } + } + return 0; + } + +create_ses: + /* Over quota? 
Recycle the least recently used dynamic translation */ + if (u->nsessions >= sm->max_translations_per_user && !is_sm) + { + /* Remove the oldest dynamic translation */ + do { + oldest_index = clib_dlist_remove_head ( + tsm->list_pool, u->sessions_per_user_list_head_index); + + ASSERT (oldest_index != ~0); + + /* add it back to the end of the LRU list */ + clib_dlist_addtail (tsm->list_pool, + u->sessions_per_user_list_head_index, + oldest_index); + /* Get the list element */ + oldest = pool_elt_at_index (tsm->list_pool, oldest_index); + + /* Get the session index from the list element */ + ses_index = oldest->value; + + /* Get the session */ + s = pool_elt_at_index (tsm->sessions, ses_index); + } while (snat_is_session_static (s)); + + if (snat_is_unk_proto_session (s)) + { + /* Remove from lookup tables */ + key.l_addr = s->in2out.addr; + key.r_addr = s->ext_host_addr; + key.fib_index = s->in2out.fib_index; + key.proto = s->in2out.port; + s_kv.key[0] = key.as_u64[0]; + s_kv.key[1] = key.as_u64[1]; + if (clib_bihash_add_del_16_8 (&sm->in2out_ed, &s_kv, 0)) + clib_warning ("in2out key del failed"); + + key.l_addr = s->out2in.addr; + key.fib_index = s->out2in.fib_index; + s_kv.key[0] = key.as_u64[0]; + s_kv.key[1] = key.as_u64[1]; + if (clib_bihash_add_del_16_8 (&sm->out2in_ed, &s_kv, 0)) + clib_warning ("out2in key del failed"); + } + else + { + /* log NAT event */ + snat_ipfix_logging_nat44_ses_delete(s->in2out.addr.as_u32, + s->out2in.addr.as_u32, + s->in2out.protocol, + s->in2out.port, + s->out2in.port, + s->in2out.fib_index); + + snat_free_outside_address_and_port (sm, thread_index, &s->out2in, + s->outside_address_index); + + /* Remove in2out, out2in keys */ + kv.key = s->in2out.as_u64; + if (clib_bihash_add_del_8_8 ( + &sm->per_thread_data[thread_index].in2out, &kv, 0)) + clib_warning ("in2out key del failed"); + kv.key = s->out2in.as_u64; + if (clib_bihash_add_del_8_8 ( + &sm->per_thread_data[thread_index].out2in, &kv, 0)) + clib_warning ("out2in key del failed"); + } + } + else + { + /* Create a new session */ + pool_get (tsm->sessions, s); + memset (s, 0, sizeof (*s)); + + /* Create list elts */ + pool_get (tsm->list_pool, elt); + clib_dlist_init (tsm->list_pool, elt - tsm->list_pool); + elt->value = s - tsm->sessions; + s->per_user_index = elt - tsm->list_pool; + s->per_user_list_head_index = u->sessions_per_user_list_head_index; + clib_dlist_addtail (tsm->list_pool, s->per_user_list_head_index, + s->per_user_index); + } + + s->ext_host_addr.as_u32 = ip->dst_address.as_u32; + s->flags |= SNAT_SESSION_FLAG_UNKNOWN_PROTO; + s->outside_address_index = address_index; + s->out2in.addr.as_u32 = new_addr; + s->out2in.fib_index = sm->outside_fib_index; + s->in2out.addr.as_u32 = old_addr; + s->in2out.fib_index = rx_fib_index; + s->in2out.port = s->out2in.port = ip->protocol; + if (is_sm) + { + u->nstaticsessions++; + s->flags |= SNAT_SESSION_FLAG_STATIC_MAPPING; + } + else + { + u->nsessions++; + } + + /* Add to lookup tables */ + key.l_addr.as_u32 = old_addr; + key.r_addr = ip->dst_address; + key.proto = ip->protocol; + key.fib_index = rx_fib_index; + s_kv.key[0] = key.as_u64[0]; + s_kv.key[1] = key.as_u64[1]; + s_kv.value = s - tsm->sessions; + if (clib_bihash_add_del_16_8 (&sm->in2out_ed, &s_kv, 1)) + clib_warning ("in2out key add failed"); + + key.l_addr.as_u32 = new_addr; + key.fib_index = sm->outside_fib_index; + s_kv.key[0] = key.as_u64[0]; + s_kv.key[1] = key.as_u64[1]; + if (clib_bihash_add_del_16_8 (&sm->out2in_ed, &s_kv, 1)) + clib_warning ("out2in key add failed"); + } + + /* 
Update IP checksum */ + sum = ip->checksum; + sum = ip_csum_update (sum, old_addr, new_addr, ip4_header_t, src_address); + ip->checksum = ip_csum_fold (sum); + + /* Accounting */ + s->last_heard = now; + s->total_pkts++; + s->total_bytes += vlib_buffer_length_in_chain (vm, b); + /* Per-user LRU list maintenance */ + clib_dlist_remove (tsm->list_pool, s->per_user_index); + clib_dlist_addtail (tsm->list_pool, s->per_user_list_head_index, + s->per_user_index); + + /* Hairpinning */ + if (vnet_buffer(b)->sw_if_index[VLIB_TX] == ~0) + snat_hairpinning_unknown_proto(sm, b, ip); + + if (vnet_buffer(b)->sw_if_index[VLIB_TX] == ~0) + vnet_buffer(b)->sw_if_index[VLIB_TX] = sm->outside_fib_index; + + return s; +} + +static snat_session_t * +snat_in2out_lb (snat_main_t *sm, + vlib_buffer_t * b, + ip4_header_t * ip, + u32 rx_fib_index, + u32 thread_index, + f64 now, + vlib_main_t * vm, + vlib_node_runtime_t * node) +{ + nat_ed_ses_key_t key; + clib_bihash_kv_16_8_t s_kv, s_value; + udp_header_t *udp = ip4_next_header (ip); + tcp_header_t *tcp = (tcp_header_t *) udp; + snat_session_t *s = 0; + snat_main_per_thread_data_t *tsm = &sm->per_thread_data[thread_index]; + u32 old_addr, new_addr; + u16 new_port, old_port; + ip_csum_t sum; + u32 proto = ip_proto_to_snat_proto (ip->protocol); + snat_session_key_t e_key, l_key; + clib_bihash_kv_8_8_t kv, value; + snat_user_key_t u_key; + snat_user_t *u; + dlist_elt_t *head, *elt; + + old_addr = ip->src_address.as_u32; + + key.l_addr = ip->src_address; + key.r_addr = ip->dst_address; + key.fib_index = rx_fib_index; + key.proto = ip->protocol; + key.rsvd = 0; + key.l_port = udp->src_port; + s_kv.key[0] = key.as_u64[0]; + s_kv.key[1] = key.as_u64[1]; + + if (!clib_bihash_search_16_8 (&sm->in2out_ed, &s_kv, &s_value)) + { + s = pool_elt_at_index (tsm->sessions, s_value.value); + } + else + { + if (PREDICT_FALSE (maximum_sessions_exceeded (sm, thread_index))) + { + b->error = node->errors[SNAT_IN2OUT_ERROR_MAX_SESSIONS_EXCEEDED]; + return 0; + } + + l_key.addr = ip->src_address; + l_key.port = udp->src_port; + l_key.protocol = proto; + l_key.fib_index = rx_fib_index; + if (snat_static_mapping_match(sm, l_key, &e_key, 0, 0)) + return 0; + + u_key.addr = ip->src_address; + u_key.fib_index = rx_fib_index; + kv.key = u_key.as_u64; + + /* Ever heard of the "user" = src ip4 address before? 
*/ + if (clib_bihash_search_8_8 (&tsm->user_hash, &kv, &value)) + { + /* no, make a new one */ + pool_get (tsm->users, u); + memset (u, 0, sizeof (*u)); + u->addr = ip->src_address; + u->fib_index = rx_fib_index; + + pool_get (tsm->list_pool, head); + u->sessions_per_user_list_head_index = head - tsm->list_pool; + + clib_dlist_init (tsm->list_pool, + u->sessions_per_user_list_head_index); + + kv.value = u - tsm->users; + + /* add user */ + if (clib_bihash_add_del_8_8 (&tsm->user_hash, &kv, 1)) + clib_warning ("user key add failed"); + } + else + { + u = pool_elt_at_index (tsm->users, value.value); + } + + /* Create a new session */ + pool_get (tsm->sessions, s); + memset (s, 0, sizeof (*s)); + + s->ext_host_addr.as_u32 = ip->dst_address.as_u32; + s->flags |= SNAT_SESSION_FLAG_STATIC_MAPPING; + s->flags |= SNAT_SESSION_FLAG_LOAD_BALANCING; + s->outside_address_index = ~0; + s->in2out = l_key; + s->out2in = e_key; + u->nstaticsessions++; + + /* Create list elts */ + pool_get (tsm->list_pool, elt); + clib_dlist_init (tsm->list_pool, elt - tsm->list_pool); + elt->value = s - tsm->sessions; + s->per_user_index = elt - tsm->list_pool; + s->per_user_list_head_index = u->sessions_per_user_list_head_index; + clib_dlist_addtail (tsm->list_pool, s->per_user_list_head_index, + s->per_user_index); + + /* Add to lookup tables */ + s_kv.value = s - tsm->sessions; + if (clib_bihash_add_del_16_8 (&sm->in2out_ed, &s_kv, 1)) + clib_warning ("in2out-ed key add failed"); + + key.l_addr = e_key.addr; + key.fib_index = e_key.fib_index; + key.l_port = e_key.port; + s_kv.key[0] = key.as_u64[0]; + s_kv.key[1] = key.as_u64[1]; + if (clib_bihash_add_del_16_8 (&sm->out2in_ed, &s_kv, 1)) + clib_warning ("out2in-ed key add failed"); + } + + new_addr = ip->src_address.as_u32 = s->out2in.addr.as_u32; + + /* Update IP checksum */ + sum = ip->checksum; + sum = ip_csum_update (sum, old_addr, new_addr, ip4_header_t, src_address); + ip->checksum = ip_csum_fold (sum); + + if (PREDICT_TRUE(proto == SNAT_PROTOCOL_TCP)) + { + old_port = tcp->src_port; + tcp->src_port = s->out2in.port; + new_port = tcp->src_port; + + sum = tcp->checksum; + sum = ip_csum_update (sum, old_addr, new_addr, ip4_header_t, src_address); + sum = ip_csum_update (sum, old_port, new_port, ip4_header_t, length); + tcp->checksum = ip_csum_fold(sum); + } + else + { + udp->src_port = s->out2in.port; + udp->checksum = 0; + } + + if (vnet_buffer(b)->sw_if_index[VLIB_TX] == ~0) + vnet_buffer(b)->sw_if_index[VLIB_TX] = sm->outside_fib_index; + + /* Accounting */ + s->last_heard = now; + s->total_pkts++; + s->total_bytes += vlib_buffer_length_in_chain (vm, b); + return s; +} + +static inline uword +snat_in2out_node_fn_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame, int is_slow_path, + int is_output_feature) +{ + u32 n_left_from, * from, * to_next; + snat_in2out_next_t next_index; + u32 pkts_processed = 0; + snat_main_t * sm = &snat_main; + f64 now = vlib_time_now (vm); + u32 stats_node_index; + u32 thread_index = vlib_get_thread_index (); + + stats_node_index = is_slow_path ? 
snat_in2out_slowpath_node.index : + snat_in2out_node.index; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next >= 2) + { + u32 bi0, bi1; + vlib_buffer_t * b0, * b1; + u32 next0, next1; + u32 sw_if_index0, sw_if_index1; + ip4_header_t * ip0, * ip1; + ip_csum_t sum0, sum1; + u32 new_addr0, old_addr0, new_addr1, old_addr1; + u16 old_port0, new_port0, old_port1, new_port1; + udp_header_t * udp0, * udp1; + tcp_header_t * tcp0, * tcp1; + icmp46_header_t * icmp0, * icmp1; + snat_session_key_t key0, key1; + u32 rx_fib_index0, rx_fib_index1; + u32 proto0, proto1; + snat_session_t * s0 = 0, * s1 = 0; + clib_bihash_kv_8_8_t kv0, value0, kv1, value1; + u32 iph_offset0 = 0, iph_offset1 = 0; + + /* Prefetch next iteration. */ + { + vlib_buffer_t * p2, * p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE); + } + + /* speculatively enqueue b0 and b1 to the current next frame */ + to_next[0] = bi0 = from[0]; + to_next[1] = bi1 = from[1]; + from += 2; + to_next += 2; + n_left_from -= 2; + n_left_to_next -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + if (is_output_feature) + iph_offset0 = vnet_buffer (b0)->ip.save_rewrite_length; + + ip0 = (ip4_header_t *) ((u8 *) vlib_buffer_get_current (b0) + + iph_offset0); + + udp0 = ip4_next_header (ip0); + tcp0 = (tcp_header_t *) udp0; + icmp0 = (icmp46_header_t *) udp0; + + sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX]; + rx_fib_index0 = vec_elt (sm->ip4_main->fib_index_by_sw_if_index, + sw_if_index0); + + next0 = next1 = SNAT_IN2OUT_NEXT_LOOKUP; + + if (PREDICT_FALSE(ip0->ttl == 1)) + { + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + icmp4_error_set_vnet_buffer (b0, ICMP4_time_exceeded, + ICMP4_time_exceeded_ttl_exceeded_in_transit, + 0); + next0 = SNAT_IN2OUT_NEXT_ICMP_ERROR; + goto trace00; + } + + proto0 = ip_proto_to_snat_proto (ip0->protocol); + + /* Next configured feature, probably ip4-lookup */ + if (is_slow_path) + { + if (PREDICT_FALSE (proto0 == ~0)) + { + s0 = snat_in2out_unknown_proto (sm, b0, ip0, rx_fib_index0, + thread_index, now, vm, node); + if (!s0) + next0 = SNAT_IN2OUT_NEXT_DROP; + goto trace00; + } + + if (PREDICT_FALSE (proto0 == SNAT_PROTOCOL_ICMP)) + { + next0 = icmp_in2out_slow_path + (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, + node, next0, now, thread_index, &s0); + goto trace00; + } + } + else + { + if (PREDICT_FALSE (proto0 == ~0 || proto0 == SNAT_PROTOCOL_ICMP)) + { + next0 = SNAT_IN2OUT_NEXT_SLOW_PATH; + goto trace00; + } + } + + key0.addr = ip0->src_address; + key0.port = udp0->src_port; + key0.protocol = proto0; + key0.fib_index = rx_fib_index0; + + kv0.key = key0.as_u64; + + if (PREDICT_FALSE (clib_bihash_search_8_8 ( + &sm->per_thread_data[thread_index].in2out, &kv0, &value0) != 0)) + { + if (is_slow_path) + { + if (PREDICT_FALSE(snat_not_translate(sm, node, sw_if_index0, + ip0, proto0, rx_fib_index0, thread_index)) && !is_output_feature) + goto trace00; + + next0 = slow_path (sm, b0, ip0, rx_fib_index0, &key0, + &s0, node, next0, thread_index); + if (PREDICT_FALSE (next0 == SNAT_IN2OUT_NEXT_DROP)) + goto 
trace00; + } + else + { + next0 = SNAT_IN2OUT_NEXT_SLOW_PATH; + goto trace00; + } + } + else + { + if (PREDICT_FALSE (value0.value == ~0ULL)) + { + if (is_slow_path) + { + s0 = snat_in2out_lb(sm, b0, ip0, rx_fib_index0, + thread_index, now, vm, node); + if (!s0) + next0 = SNAT_IN2OUT_NEXT_DROP; + goto trace00; + } + else + { + next0 = SNAT_IN2OUT_NEXT_SLOW_PATH; + goto trace00; + } + } + else + { + s0 = pool_elt_at_index ( + sm->per_thread_data[thread_index].sessions, + value0.value); + } + } + + old_addr0 = ip0->src_address.as_u32; + ip0->src_address = s0->out2in.addr; + new_addr0 = ip0->src_address.as_u32; + if (!is_output_feature) + vnet_buffer(b0)->sw_if_index[VLIB_TX] = s0->out2in.fib_index; + + sum0 = ip0->checksum; + sum0 = ip_csum_update (sum0, old_addr0, new_addr0, + ip4_header_t, + src_address /* changed member */); + ip0->checksum = ip_csum_fold (sum0); + + if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) + { + old_port0 = tcp0->src_port; + tcp0->src_port = s0->out2in.port; + new_port0 = tcp0->src_port; + + sum0 = tcp0->checksum; + sum0 = ip_csum_update (sum0, old_addr0, new_addr0, + ip4_header_t, + dst_address /* changed member */); + sum0 = ip_csum_update (sum0, old_port0, new_port0, + ip4_header_t /* cheat */, + length /* changed member */); + tcp0->checksum = ip_csum_fold(sum0); + } + else + { + old_port0 = udp0->src_port; + udp0->src_port = s0->out2in.port; + udp0->checksum = 0; + } + + /* Hairpinning */ + if (!is_output_feature) + snat_hairpinning (sm, b0, ip0, udp0, tcp0, proto0); + + /* Accounting */ + s0->last_heard = now; + s0->total_pkts++; + s0->total_bytes += vlib_buffer_length_in_chain (vm, b0); + /* Per-user LRU list maintenance for dynamic translation */ + if (!snat_is_session_static (s0)) + { + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, + s0->per_user_index); + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, + s0->per_user_list_head_index, + s0->per_user_index); + } + trace00: + + if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + snat_in2out_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->is_slow_path = is_slow_path; + t->sw_if_index = sw_if_index0; + t->next_index = next0; + t->session_index = ~0; + if (s0) + t->session_index = s0 - sm->per_thread_data[thread_index].sessions; + } + + pkts_processed += next0 != SNAT_IN2OUT_NEXT_DROP; + + if (is_output_feature) + iph_offset1 = vnet_buffer (b1)->ip.save_rewrite_length; + + ip1 = (ip4_header_t *) ((u8 *) vlib_buffer_get_current (b1) + + iph_offset1); + + udp1 = ip4_next_header (ip1); + tcp1 = (tcp_header_t *) udp1; + icmp1 = (icmp46_header_t *) udp1; + + sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX]; + rx_fib_index1 = vec_elt (sm->ip4_main->fib_index_by_sw_if_index, + sw_if_index1); + + if (PREDICT_FALSE(ip1->ttl == 1)) + { + vnet_buffer (b1)->sw_if_index[VLIB_TX] = (u32) ~ 0; + icmp4_error_set_vnet_buffer (b1, ICMP4_time_exceeded, + ICMP4_time_exceeded_ttl_exceeded_in_transit, + 0); + next1 = SNAT_IN2OUT_NEXT_ICMP_ERROR; + goto trace01; + } + + proto1 = ip_proto_to_snat_proto (ip1->protocol); + + /* Next configured feature, probably ip4-lookup */ + if (is_slow_path) + { + if (PREDICT_FALSE (proto1 == ~0)) + { + s1 = snat_in2out_unknown_proto (sm, b1, ip1, rx_fib_index1, + thread_index, now, vm, node); + if (!s1) + next1 = SNAT_IN2OUT_NEXT_DROP; + goto trace01; + } + + if (PREDICT_FALSE (proto1 == SNAT_PROTOCOL_ICMP)) + { + next1 = icmp_in2out_slow_path + (sm, b1, ip1, icmp1, sw_if_index1, rx_fib_index1, 
node, + next1, now, thread_index, &s1); + goto trace01; + } + } + else + { + if (PREDICT_FALSE (proto1 == ~0 || proto1 == SNAT_PROTOCOL_ICMP)) + { + next1 = SNAT_IN2OUT_NEXT_SLOW_PATH; + goto trace01; + } + } + + key1.addr = ip1->src_address; + key1.port = udp1->src_port; + key1.protocol = proto1; + key1.fib_index = rx_fib_index1; + + kv1.key = key1.as_u64; + + if (PREDICT_FALSE(clib_bihash_search_8_8 ( + &sm->per_thread_data[thread_index].in2out, &kv1, &value1) != 0)) + { + if (is_slow_path) + { + if (PREDICT_FALSE(snat_not_translate(sm, node, sw_if_index1, + ip1, proto1, rx_fib_index1, thread_index)) && !is_output_feature) + goto trace01; + + next1 = slow_path (sm, b1, ip1, rx_fib_index1, &key1, + &s1, node, next1, thread_index); + if (PREDICT_FALSE (next1 == SNAT_IN2OUT_NEXT_DROP)) + goto trace01; + } + else + { + next1 = SNAT_IN2OUT_NEXT_SLOW_PATH; + goto trace01; + } + } + else + { + if (PREDICT_FALSE (value1.value == ~0ULL)) + { + if (is_slow_path) + { + s1 = snat_in2out_lb(sm, b1, ip1, rx_fib_index1, + thread_index, now, vm, node); + if (!s1) + next1 = SNAT_IN2OUT_NEXT_DROP; + goto trace01; + } + else + { + next1 = SNAT_IN2OUT_NEXT_SLOW_PATH; + goto trace01; + } + } + else + { + s1 = pool_elt_at_index ( + sm->per_thread_data[thread_index].sessions, + value1.value); + } + } + + old_addr1 = ip1->src_address.as_u32; + ip1->src_address = s1->out2in.addr; + new_addr1 = ip1->src_address.as_u32; + if (!is_output_feature) + vnet_buffer(b1)->sw_if_index[VLIB_TX] = s1->out2in.fib_index; + + sum1 = ip1->checksum; + sum1 = ip_csum_update (sum1, old_addr1, new_addr1, + ip4_header_t, + src_address /* changed member */); + ip1->checksum = ip_csum_fold (sum1); + + if (PREDICT_TRUE(proto1 == SNAT_PROTOCOL_TCP)) + { + old_port1 = tcp1->src_port; + tcp1->src_port = s1->out2in.port; + new_port1 = tcp1->src_port; + + sum1 = tcp1->checksum; + sum1 = ip_csum_update (sum1, old_addr1, new_addr1, + ip4_header_t, + dst_address /* changed member */); + sum1 = ip_csum_update (sum1, old_port1, new_port1, + ip4_header_t /* cheat */, + length /* changed member */); + tcp1->checksum = ip_csum_fold(sum1); + } + else + { + old_port1 = udp1->src_port; + udp1->src_port = s1->out2in.port; + udp1->checksum = 0; + } + + /* Hairpinning */ + if (!is_output_feature) + snat_hairpinning (sm, b1, ip1, udp1, tcp1, proto1); + + /* Accounting */ + s1->last_heard = now; + s1->total_pkts++; + s1->total_bytes += vlib_buffer_length_in_chain (vm, b1); + /* Per-user LRU list maintenance for dynamic translation */ + if (!snat_is_session_static (s1)) + { + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, + s1->per_user_index); + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, + s1->per_user_list_head_index, + s1->per_user_index); + } + trace01: + + if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE) + && (b1->flags & VLIB_BUFFER_IS_TRACED))) + { + snat_in2out_trace_t *t = + vlib_add_trace (vm, node, b1, sizeof (*t)); + t->sw_if_index = sw_if_index1; + t->next_index = next1; + t->session_index = ~0; + if (s1) + t->session_index = s1 - sm->per_thread_data[thread_index].sessions; + } + + pkts_processed += next1 != SNAT_IN2OUT_NEXT_DROP; + + /* verify speculative enqueues, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, next0, next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t * b0; + u32 next0; + u32 sw_if_index0; + ip4_header_t * ip0; + ip_csum_t sum0; + u32 new_addr0, old_addr0; + 
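/* Scalar tail of the node: same translation logic as the two-packet
+ loop above, applied to the buffers remaining in the frame. All
+ checksum rewrites are incremental updates of the existing sums
+ (RFC 1624 style) rather than full recomputations. */
+ 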
u16 old_port0, new_port0; + udp_header_t * udp0; + tcp_header_t * tcp0; + icmp46_header_t * icmp0; + snat_session_key_t key0; + u32 rx_fib_index0; + u32 proto0; + snat_session_t * s0 = 0; + clib_bihash_kv_8_8_t kv0, value0; + u32 iph_offset0 = 0; + + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + next0 = SNAT_IN2OUT_NEXT_LOOKUP; + + if (is_output_feature) + iph_offset0 = vnet_buffer (b0)->ip.save_rewrite_length; + + ip0 = (ip4_header_t *) ((u8 *) vlib_buffer_get_current (b0) + + iph_offset0); + + udp0 = ip4_next_header (ip0); + tcp0 = (tcp_header_t *) udp0; + icmp0 = (icmp46_header_t *) udp0; + + sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX]; + rx_fib_index0 = vec_elt (sm->ip4_main->fib_index_by_sw_if_index, + sw_if_index0); + + if (PREDICT_FALSE(ip0->ttl == 1)) + { + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + icmp4_error_set_vnet_buffer (b0, ICMP4_time_exceeded, + ICMP4_time_exceeded_ttl_exceeded_in_transit, + 0); + next0 = SNAT_IN2OUT_NEXT_ICMP_ERROR; + goto trace0; + } + + proto0 = ip_proto_to_snat_proto (ip0->protocol); + + /* Next configured feature, probably ip4-lookup */ + if (is_slow_path) + { + if (PREDICT_FALSE (proto0 == ~0)) + { + s0 = snat_in2out_unknown_proto (sm, b0, ip0, rx_fib_index0, + thread_index, now, vm, node); + if (!s0) + next0 = SNAT_IN2OUT_NEXT_DROP; + goto trace0; + } + + if (PREDICT_FALSE (proto0 == SNAT_PROTOCOL_ICMP)) + { + next0 = icmp_in2out_slow_path + (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, + next0, now, thread_index, &s0); + goto trace0; + } + } + else + { + if (PREDICT_FALSE (proto0 == ~0 || proto0 == SNAT_PROTOCOL_ICMP)) + { + next0 = SNAT_IN2OUT_NEXT_SLOW_PATH; + goto trace0; + } + } + + key0.addr = ip0->src_address; + key0.port = udp0->src_port; + key0.protocol = proto0; + key0.fib_index = rx_fib_index0; + + kv0.key = key0.as_u64; + + if (clib_bihash_search_8_8 (&sm->per_thread_data[thread_index].in2out, + &kv0, &value0)) + { + if (is_slow_path) + { + if (PREDICT_FALSE(snat_not_translate(sm, node, sw_if_index0, + ip0, proto0, rx_fib_index0, thread_index)) && !is_output_feature) + goto trace0; + + next0 = slow_path (sm, b0, ip0, rx_fib_index0, &key0, + &s0, node, next0, thread_index); + + if (PREDICT_FALSE (next0 == SNAT_IN2OUT_NEXT_DROP)) + goto trace0; + } + else + { + next0 = SNAT_IN2OUT_NEXT_SLOW_PATH; + goto trace0; + } + } + else + { + if (PREDICT_FALSE (value0.value == ~0ULL)) + { + if (is_slow_path) + { + s0 = snat_in2out_lb(sm, b0, ip0, rx_fib_index0, + thread_index, now, vm, node); + if (!s0) + next0 = SNAT_IN2OUT_NEXT_DROP; + goto trace0; + } + else + { + next0 = SNAT_IN2OUT_NEXT_SLOW_PATH; + goto trace0; + } + } + else + { + s0 = pool_elt_at_index ( + sm->per_thread_data[thread_index].sessions, + value0.value); + } + } + + old_addr0 = ip0->src_address.as_u32; + ip0->src_address = s0->out2in.addr; + new_addr0 = ip0->src_address.as_u32; + if (!is_output_feature) + vnet_buffer(b0)->sw_if_index[VLIB_TX] = s0->out2in.fib_index; + + sum0 = ip0->checksum; + sum0 = ip_csum_update (sum0, old_addr0, new_addr0, + ip4_header_t, + src_address /* changed member */); + ip0->checksum = ip_csum_fold (sum0); + + if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) + { + old_port0 = tcp0->src_port; + tcp0->src_port = s0->out2in.port; + new_port0 = tcp0->src_port; + + sum0 = tcp0->checksum; + sum0 = ip_csum_update (sum0, old_addr0, new_addr0, + ip4_header_t, + dst_address /* 
changed member */); + sum0 = ip_csum_update (sum0, old_port0, new_port0, + ip4_header_t /* cheat */, + length /* changed member */); + tcp0->checksum = ip_csum_fold(sum0); + } + else + { + old_port0 = udp0->src_port; + udp0->src_port = s0->out2in.port; + udp0->checksum = 0; + } + + /* Hairpinning */ + if (!is_output_feature) + snat_hairpinning (sm, b0, ip0, udp0, tcp0, proto0); + + /* Accounting */ + s0->last_heard = now; + s0->total_pkts++; + s0->total_bytes += vlib_buffer_length_in_chain (vm, b0); + /* Per-user LRU list maintenance for dynamic translation */ + if (!snat_is_session_static (s0)) + { + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, + s0->per_user_index); + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, + s0->per_user_list_head_index, + s0->per_user_index); + } + + trace0: + if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + snat_in2out_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->is_slow_path = is_slow_path; + t->sw_if_index = sw_if_index0; + t->next_index = next0; + t->session_index = ~0; + if (s0) + t->session_index = s0 - sm->per_thread_data[thread_index].sessions; + } + + pkts_processed += next0 != SNAT_IN2OUT_NEXT_DROP; + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, stats_node_index, + SNAT_IN2OUT_ERROR_IN2OUT_PACKETS, + pkts_processed); + return frame->n_vectors; +} + +static uword +snat_in2out_fast_path_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return snat_in2out_node_fn_inline (vm, node, frame, 0 /* is_slow_path */, 0); +} + +VLIB_REGISTER_NODE (snat_in2out_node) = { + .function = snat_in2out_fast_path_fn, + .name = "nat44-in2out", + .vector_size = sizeof (u32), + .format_trace = format_snat_in2out_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(snat_in2out_error_strings), + .error_strings = snat_in2out_error_strings, + + .runtime_data_bytes = sizeof (snat_runtime_t), + + .n_next_nodes = SNAT_IN2OUT_N_NEXT, + + /* edit / add dispositions here */ + .next_nodes = { + [SNAT_IN2OUT_NEXT_DROP] = "error-drop", + [SNAT_IN2OUT_NEXT_LOOKUP] = "ip4-lookup", + [SNAT_IN2OUT_NEXT_SLOW_PATH] = "nat44-in2out-slowpath", + [SNAT_IN2OUT_NEXT_ICMP_ERROR] = "ip4-icmp-error", + }, +}; + +VLIB_NODE_FUNCTION_MULTIARCH (snat_in2out_node, snat_in2out_fast_path_fn); + +static uword +snat_in2out_output_fast_path_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return snat_in2out_node_fn_inline (vm, node, frame, 0 /* is_slow_path */, 1); +} + +VLIB_REGISTER_NODE (snat_in2out_output_node) = { + .function = snat_in2out_output_fast_path_fn, + .name = "nat44-in2out-output", + .vector_size = sizeof (u32), + .format_trace = format_snat_in2out_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(snat_in2out_error_strings), + .error_strings = snat_in2out_error_strings, + + .runtime_data_bytes = sizeof (snat_runtime_t), + + .n_next_nodes = SNAT_IN2OUT_N_NEXT, + + /* edit / add dispositions here */ + .next_nodes = { + [SNAT_IN2OUT_NEXT_DROP] = "error-drop", + [SNAT_IN2OUT_NEXT_LOOKUP] = "interface-output", + [SNAT_IN2OUT_NEXT_SLOW_PATH] = "nat44-in2out-output-slowpath", + [SNAT_IN2OUT_NEXT_ICMP_ERROR] = "ip4-icmp-error", + }, +}; + +VLIB_NODE_FUNCTION_MULTIARCH 
(snat_in2out_output_node, + snat_in2out_output_fast_path_fn); + +static uword +snat_in2out_slow_path_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return snat_in2out_node_fn_inline (vm, node, frame, 1 /* is_slow_path */, 0); +} + +VLIB_REGISTER_NODE (snat_in2out_slowpath_node) = { + .function = snat_in2out_slow_path_fn, + .name = "nat44-in2out-slowpath", + .vector_size = sizeof (u32), + .format_trace = format_snat_in2out_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(snat_in2out_error_strings), + .error_strings = snat_in2out_error_strings, + + .runtime_data_bytes = sizeof (snat_runtime_t), + + .n_next_nodes = SNAT_IN2OUT_N_NEXT, + + /* edit / add dispositions here */ + .next_nodes = { + [SNAT_IN2OUT_NEXT_DROP] = "error-drop", + [SNAT_IN2OUT_NEXT_LOOKUP] = "ip4-lookup", + [SNAT_IN2OUT_NEXT_SLOW_PATH] = "nat44-in2out-slowpath", + [SNAT_IN2OUT_NEXT_ICMP_ERROR] = "ip4-icmp-error", + }, +}; + +VLIB_NODE_FUNCTION_MULTIARCH (snat_in2out_slowpath_node, + snat_in2out_slow_path_fn); + +static uword +snat_in2out_output_slow_path_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return snat_in2out_node_fn_inline (vm, node, frame, 1 /* is_slow_path */, 1); +} + +VLIB_REGISTER_NODE (snat_in2out_output_slowpath_node) = { + .function = snat_in2out_output_slow_path_fn, + .name = "nat44-in2out-output-slowpath", + .vector_size = sizeof (u32), + .format_trace = format_snat_in2out_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(snat_in2out_error_strings), + .error_strings = snat_in2out_error_strings, + + .runtime_data_bytes = sizeof (snat_runtime_t), + + .n_next_nodes = SNAT_IN2OUT_N_NEXT, + + /* edit / add dispositions here */ + .next_nodes = { + [SNAT_IN2OUT_NEXT_DROP] = "error-drop", + [SNAT_IN2OUT_NEXT_LOOKUP] = "interface-output", + [SNAT_IN2OUT_NEXT_SLOW_PATH] = "nat44-in2out-output-slowpath", + [SNAT_IN2OUT_NEXT_ICMP_ERROR] = "ip4-icmp-error", + }, +}; + +VLIB_NODE_FUNCTION_MULTIARCH (snat_in2out_output_slowpath_node, + snat_in2out_output_slow_path_fn); + +/**************************/ +/*** deterministic mode ***/ +/**************************/ +static uword +snat_det_in2out_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 n_left_from, * from, * to_next; + snat_in2out_next_t next_index; + u32 pkts_processed = 0; + snat_main_t * sm = &snat_main; + u32 now = (u32) vlib_time_now (vm); + u32 thread_index = vlib_get_thread_index (); + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next >= 2) + { + u32 bi0, bi1; + vlib_buffer_t * b0, * b1; + u32 next0, next1; + u32 sw_if_index0, sw_if_index1; + ip4_header_t * ip0, * ip1; + ip_csum_t sum0, sum1; + ip4_address_t new_addr0, old_addr0, new_addr1, old_addr1; + u16 old_port0, new_port0, lo_port0, i0; + u16 old_port1, new_port1, lo_port1, i1; + udp_header_t * udp0, * udp1; + tcp_header_t * tcp0, * tcp1; + u32 proto0, proto1; + snat_det_out_key_t key0, key1; + snat_det_map_t * dm0, * dm1; + snat_det_session_t * ses0 = 0, * ses1 = 0; + u32 rx_fib_index0, rx_fib_index1; + icmp46_header_t * icmp0, * icmp1; + + /* Prefetch next iteration. 
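Buffer headers for packets 2 and 3 are fetched for LOAD and the first cache line of their data for STORE (the rewrite below writes it), overlapping memory latency with the translation of packets 0 and 1.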
*/ + { + vlib_buffer_t * p2, * p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE); + } + + /* speculatively enqueue b0 and b1 to the current next frame */ + to_next[0] = bi0 = from[0]; + to_next[1] = bi1 = from[1]; + from += 2; + to_next += 2; + n_left_from -= 2; + n_left_to_next -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + next0 = SNAT_IN2OUT_NEXT_LOOKUP; + next1 = SNAT_IN2OUT_NEXT_LOOKUP; + + ip0 = vlib_buffer_get_current (b0); + udp0 = ip4_next_header (ip0); + tcp0 = (tcp_header_t *) udp0; + + sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX]; + + if (PREDICT_FALSE(ip0->ttl == 1)) + { + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + icmp4_error_set_vnet_buffer (b0, ICMP4_time_exceeded, + ICMP4_time_exceeded_ttl_exceeded_in_transit, + 0); + next0 = SNAT_IN2OUT_NEXT_ICMP_ERROR; + goto trace0; + } + + proto0 = ip_proto_to_snat_proto (ip0->protocol); + + if (PREDICT_FALSE(proto0 == SNAT_PROTOCOL_ICMP)) + { + rx_fib_index0 = ip4_fib_table_get_index_for_sw_if_index(sw_if_index0); + icmp0 = (icmp46_header_t *) udp0; + + next0 = icmp_in2out(sm, b0, ip0, icmp0, sw_if_index0, + rx_fib_index0, node, next0, thread_index, + &ses0, &dm0); + goto trace0; + } + + dm0 = snat_det_map_by_user(sm, &ip0->src_address); + if (PREDICT_FALSE(!dm0)) + { + clib_warning("no match for internal host %U", + format_ip4_address, &ip0->src_address); + next0 = SNAT_IN2OUT_NEXT_DROP; + b0->error = node->errors[SNAT_IN2OUT_ERROR_NO_TRANSLATION]; + goto trace0; + } + + snat_det_forward(dm0, &ip0->src_address, &new_addr0, &lo_port0); + + key0.ext_host_addr = ip0->dst_address; + key0.ext_host_port = tcp0->dst; + + ses0 = snat_det_find_ses_by_in(dm0, &ip0->src_address, tcp0->src, key0); + if (PREDICT_FALSE(!ses0)) + { + for (i0 = 0; i0 < dm0->ports_per_host; i0++) + { + key0.out_port = clib_host_to_net_u16 (lo_port0 + + ((i0 + clib_net_to_host_u16 (tcp0->src)) % dm0->ports_per_host)); + + if (snat_det_get_ses_by_out (dm0, &ip0->src_address, key0.as_u64)) + continue; + + ses0 = snat_det_ses_create(dm0, &ip0->src_address, tcp0->src, &key0); + break; + } + if (PREDICT_FALSE(!ses0)) + { + /* too many sessions for user, send ICMP error packet */ + + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + icmp4_error_set_vnet_buffer (b0, ICMP4_destination_unreachable, + ICMP4_destination_unreachable_destination_unreachable_host, + 0); + next0 = SNAT_IN2OUT_NEXT_ICMP_ERROR; + goto trace0; + } + } + + new_port0 = ses0->out.out_port; + + old_addr0.as_u32 = ip0->src_address.as_u32; + ip0->src_address.as_u32 = new_addr0.as_u32; + vnet_buffer(b0)->sw_if_index[VLIB_TX] = sm->outside_fib_index; + + sum0 = ip0->checksum; + sum0 = ip_csum_update (sum0, old_addr0.as_u32, new_addr0.as_u32, + ip4_header_t, + src_address /* changed member */); + ip0->checksum = ip_csum_fold (sum0); + + if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) + { + if (tcp0->flags & TCP_FLAG_SYN) + ses0->state = SNAT_SESSION_TCP_SYN_SENT; + else if (tcp0->flags & TCP_FLAG_ACK && ses0->state == SNAT_SESSION_TCP_SYN_SENT) + ses0->state = SNAT_SESSION_TCP_ESTABLISHED; + else if (tcp0->flags & TCP_FLAG_FIN && ses0->state == SNAT_SESSION_TCP_ESTABLISHED) + ses0->state = SNAT_SESSION_TCP_FIN_WAIT; + else if (tcp0->flags & TCP_FLAG_ACK && ses0->state == SNAT_SESSION_TCP_FIN_WAIT) + snat_det_ses_close(dm0, 
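/* ACK in FIN-WAIT completes the close: release the session slot */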
ses0); + else if (tcp0->flags & TCP_FLAG_FIN && ses0->state == SNAT_SESSION_TCP_CLOSE_WAIT) + ses0->state = SNAT_SESSION_TCP_LAST_ACK; + else if (tcp0->flags == 0 && ses0->state == SNAT_SESSION_UNKNOWN) + ses0->state = SNAT_SESSION_TCP_ESTABLISHED; + + old_port0 = tcp0->src; + tcp0->src = new_port0; + + sum0 = tcp0->checksum; + sum0 = ip_csum_update (sum0, old_addr0.as_u32, new_addr0.as_u32, + ip4_header_t, + dst_address /* changed member */); + sum0 = ip_csum_update (sum0, old_port0, new_port0, + ip4_header_t /* cheat */, + length /* changed member */); + tcp0->checksum = ip_csum_fold(sum0); + } + else + { + ses0->state = SNAT_SESSION_UDP_ACTIVE; + old_port0 = udp0->src_port; + udp0->src_port = new_port0; + udp0->checksum = 0; + } + + switch(ses0->state) + { + case SNAT_SESSION_UDP_ACTIVE: + ses0->expire = now + sm->udp_timeout; + break; + case SNAT_SESSION_TCP_SYN_SENT: + case SNAT_SESSION_TCP_FIN_WAIT: + case SNAT_SESSION_TCP_CLOSE_WAIT: + case SNAT_SESSION_TCP_LAST_ACK: + ses0->expire = now + sm->tcp_transitory_timeout; + break; + case SNAT_SESSION_TCP_ESTABLISHED: + ses0->expire = now + sm->tcp_established_timeout; + break; + } + + trace0: + if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + snat_in2out_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->is_slow_path = 0; + t->sw_if_index = sw_if_index0; + t->next_index = next0; + t->session_index = ~0; + if (ses0) + t->session_index = ses0 - dm0->sessions; + } + + pkts_processed += next0 != SNAT_IN2OUT_NEXT_DROP; + + ip1 = vlib_buffer_get_current (b1); + udp1 = ip4_next_header (ip1); + tcp1 = (tcp_header_t *) udp1; + + sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX]; + + if (PREDICT_FALSE(ip1->ttl == 1)) + { + vnet_buffer (b1)->sw_if_index[VLIB_TX] = (u32) ~ 0; + icmp4_error_set_vnet_buffer (b1, ICMP4_time_exceeded, + ICMP4_time_exceeded_ttl_exceeded_in_transit, + 0); + next1 = SNAT_IN2OUT_NEXT_ICMP_ERROR; + goto trace1; + } + + proto1 = ip_proto_to_snat_proto (ip1->protocol); + + if (PREDICT_FALSE(proto1 == SNAT_PROTOCOL_ICMP)) + { + rx_fib_index1 = ip4_fib_table_get_index_for_sw_if_index(sw_if_index1); + icmp1 = (icmp46_header_t *) udp1; + + next1 = icmp_in2out(sm, b1, ip1, icmp1, sw_if_index1, + rx_fib_index1, node, next1, thread_index, + &ses1, &dm1); + goto trace1; + } + + dm1 = snat_det_map_by_user(sm, &ip1->src_address); + if (PREDICT_FALSE(!dm1)) + { + clib_warning("no match for internal host %U", + format_ip4_address, &ip1->src_address); + next1 = SNAT_IN2OUT_NEXT_DROP; + b1->error = node->errors[SNAT_IN2OUT_ERROR_NO_TRANSLATION]; + goto trace1; + } + + snat_det_forward(dm1, &ip1->src_address, &new_addr1, &lo_port1); + + key1.ext_host_addr = ip1->dst_address; + key1.ext_host_port = tcp1->dst; + + ses1 = snat_det_find_ses_by_in(dm1, &ip1->src_address, tcp1->src, key1); + if (PREDICT_FALSE(!ses1)) + { + for (i1 = 0; i1 < dm1->ports_per_host; i1++) + { + key1.out_port = clib_host_to_net_u16 (lo_port1 + + ((i1 + clib_net_to_host_u16 (tcp1->src)) % dm1->ports_per_host)); + + if (snat_det_get_ses_by_out (dm1, &ip1->src_address, key1.as_u64)) + continue; + + ses1 = snat_det_ses_create(dm1, &ip1->src_address, tcp1->src, &key1); + break; + } + if (PREDICT_FALSE(!ses1)) + { + /* too many sessions for user, send ICMP error packet */ + + vnet_buffer (b1)->sw_if_index[VLIB_TX] = (u32) ~ 0; + icmp4_error_set_vnet_buffer (b1, ICMP4_destination_unreachable, + ICMP4_destination_unreachable_destination_unreachable_host, + 0); + next1 = SNAT_IN2OUT_NEXT_ICMP_ERROR; + goto
trace1; + } + } + + new_port1 = ses1->out.out_port; + + old_addr1.as_u32 = ip1->src_address.as_u32; + ip1->src_address.as_u32 = new_addr1.as_u32; + vnet_buffer(b1)->sw_if_index[VLIB_TX] = sm->outside_fib_index; + + sum1 = ip1->checksum; + sum1 = ip_csum_update (sum1, old_addr1.as_u32, new_addr1.as_u32, + ip4_header_t, + src_address /* changed member */); + ip1->checksum = ip_csum_fold (sum1); + + if (PREDICT_TRUE(proto1 == SNAT_PROTOCOL_TCP)) + { + if (tcp1->flags & TCP_FLAG_SYN) + ses1->state = SNAT_SESSION_TCP_SYN_SENT; + else if (tcp1->flags & TCP_FLAG_ACK && ses1->state == SNAT_SESSION_TCP_SYN_SENT) + ses1->state = SNAT_SESSION_TCP_ESTABLISHED; + else if (tcp1->flags & TCP_FLAG_FIN && ses1->state == SNAT_SESSION_TCP_ESTABLISHED) + ses1->state = SNAT_SESSION_TCP_FIN_WAIT; + else if (tcp1->flags & TCP_FLAG_ACK && ses1->state == SNAT_SESSION_TCP_FIN_WAIT) + snat_det_ses_close(dm1, ses1); + else if (tcp1->flags & TCP_FLAG_FIN && ses1->state == SNAT_SESSION_TCP_CLOSE_WAIT) + ses1->state = SNAT_SESSION_TCP_LAST_ACK; + else if (tcp1->flags == 0 && ses1->state == SNAT_SESSION_UNKNOWN) + ses1->state = SNAT_SESSION_TCP_ESTABLISHED; + + old_port1 = tcp1->src; + tcp1->src = new_port1; + + sum1 = tcp1->checksum; + sum1 = ip_csum_update (sum1, old_addr1.as_u32, new_addr1.as_u32, + ip4_header_t, + dst_address /* changed member */); + sum1 = ip_csum_update (sum1, old_port1, new_port1, + ip4_header_t /* cheat */, + length /* changed member */); + tcp1->checksum = ip_csum_fold(sum1); + } + else + { + ses1->state = SNAT_SESSION_UDP_ACTIVE; + old_port1 = udp1->src_port; + udp1->src_port = new_port1; + udp1->checksum = 0; + } + + switch(ses1->state) + { + case SNAT_SESSION_UDP_ACTIVE: + ses1->expire = now + sm->udp_timeout; + break; + case SNAT_SESSION_TCP_SYN_SENT: + case SNAT_SESSION_TCP_FIN_WAIT: + case SNAT_SESSION_TCP_CLOSE_WAIT: + case SNAT_SESSION_TCP_LAST_ACK: + ses1->expire = now + sm->tcp_transitory_timeout; + break; + case SNAT_SESSION_TCP_ESTABLISHED: + ses1->expire = now + sm->tcp_established_timeout; + break; + } + + trace1: + if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE) + && (b1->flags & VLIB_BUFFER_IS_TRACED))) + { + snat_in2out_trace_t *t = + vlib_add_trace (vm, node, b1, sizeof (*t)); + t->is_slow_path = 0; + t->sw_if_index = sw_if_index1; + t->next_index = next1; + t->session_index = ~0; + if (ses1) + t->session_index = ses1 - dm1->sessions; + } + + pkts_processed += next1 != SNAT_IN2OUT_NEXT_DROP; + + /* verify speculative enqueues, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, next0, next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t * b0; + u32 next0; + u32 sw_if_index0; + ip4_header_t * ip0; + ip_csum_t sum0; + ip4_address_t new_addr0, old_addr0; + u16 old_port0, new_port0, lo_port0, i0; + udp_header_t * udp0; + tcp_header_t * tcp0; + u32 proto0; + snat_det_out_key_t key0; + snat_det_map_t * dm0; + snat_det_session_t * ses0 = 0; + u32 rx_fib_index0; + icmp46_header_t * icmp0; + + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + next0 = SNAT_IN2OUT_NEXT_LOOKUP; + + ip0 = vlib_buffer_get_current (b0); + udp0 = ip4_next_header (ip0); + tcp0 = (tcp_header_t *) udp0; + + sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX]; + + if (PREDICT_FALSE(ip0->ttl == 1)) + { + vnet_buffer 
(b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + icmp4_error_set_vnet_buffer (b0, ICMP4_time_exceeded, + ICMP4_time_exceeded_ttl_exceeded_in_transit, + 0); + next0 = SNAT_IN2OUT_NEXT_ICMP_ERROR; + goto trace00; + } + + proto0 = ip_proto_to_snat_proto (ip0->protocol); + + if (PREDICT_FALSE(proto0 == SNAT_PROTOCOL_ICMP)) + { + rx_fib_index0 = ip4_fib_table_get_index_for_sw_if_index(sw_if_index0); + icmp0 = (icmp46_header_t *) udp0; + + next0 = icmp_in2out(sm, b0, ip0, icmp0, sw_if_index0, + rx_fib_index0, node, next0, thread_index, + &ses0, &dm0); + goto trace00; + } + + dm0 = snat_det_map_by_user(sm, &ip0->src_address); + if (PREDICT_FALSE(!dm0)) + { + clib_warning("no match for internal host %U", + format_ip4_address, &ip0->src_address); + next0 = SNAT_IN2OUT_NEXT_DROP; + b0->error = node->errors[SNAT_IN2OUT_ERROR_NO_TRANSLATION]; + goto trace00; + } + + snat_det_forward(dm0, &ip0->src_address, &new_addr0, &lo_port0); + + key0.ext_host_addr = ip0->dst_address; + key0.ext_host_port = tcp0->dst; + + ses0 = snat_det_find_ses_by_in(dm0, &ip0->src_address, tcp0->src, key0); + if (PREDICT_FALSE(!ses0)) + { + for (i0 = 0; i0 < dm0->ports_per_host; i0++) + { + key0.out_port = clib_host_to_net_u16 (lo_port0 + + ((i0 + clib_net_to_host_u16 (tcp0->src)) % dm0->ports_per_host)); + + if (snat_det_get_ses_by_out (dm0, &ip0->src_address, key0.as_u64)) + continue; + + ses0 = snat_det_ses_create(dm0, &ip0->src_address, tcp0->src, &key0); + break; + } + if (PREDICT_FALSE(!ses0)) + { + /* too many sessions for user, send ICMP error packet */ + + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + icmp4_error_set_vnet_buffer (b0, ICMP4_destination_unreachable, + ICMP4_destination_unreachable_destination_unreachable_host, + 0); + next0 = SNAT_IN2OUT_NEXT_ICMP_ERROR; + goto trace00; + } + } + + new_port0 = ses0->out.out_port; + + old_addr0.as_u32 = ip0->src_address.as_u32; + ip0->src_address.as_u32 = new_addr0.as_u32; + vnet_buffer(b0)->sw_if_index[VLIB_TX] = sm->outside_fib_index; + + sum0 = ip0->checksum; + sum0 = ip_csum_update (sum0, old_addr0.as_u32, new_addr0.as_u32, + ip4_header_t, + src_address /* changed member */); + ip0->checksum = ip_csum_fold (sum0); + + if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) + { + if (tcp0->flags & TCP_FLAG_SYN) + ses0->state = SNAT_SESSION_TCP_SYN_SENT; + else if (tcp0->flags & TCP_FLAG_ACK && ses0->state == SNAT_SESSION_TCP_SYN_SENT) + ses0->state = SNAT_SESSION_TCP_ESTABLISHED; + else if (tcp0->flags & TCP_FLAG_FIN && ses0->state == SNAT_SESSION_TCP_ESTABLISHED) + ses0->state = SNAT_SESSION_TCP_FIN_WAIT; + else if (tcp0->flags & TCP_FLAG_ACK && ses0->state == SNAT_SESSION_TCP_FIN_WAIT) + snat_det_ses_close(dm0, ses0); + else if (tcp0->flags & TCP_FLAG_FIN && ses0->state == SNAT_SESSION_TCP_CLOSE_WAIT) + ses0->state = SNAT_SESSION_TCP_LAST_ACK; + else if (tcp0->flags == 0 && ses0->state == SNAT_SESSION_UNKNOWN) + ses0->state = SNAT_SESSION_TCP_ESTABLISHED; + + old_port0 = tcp0->src; + tcp0->src = new_port0; + + sum0 = tcp0->checksum; + sum0 = ip_csum_update (sum0, old_addr0.as_u32, new_addr0.as_u32, + ip4_header_t, + dst_address /* changed member */); + sum0 = ip_csum_update (sum0, old_port0, new_port0, + ip4_header_t /* cheat */, + length /* changed member */); + tcp0->checksum = ip_csum_fold(sum0); + } + else + { + ses0->state = SNAT_SESSION_UDP_ACTIVE; + old_port0 = udp0->src_port; + udp0->src_port = new_port0; + udp0->checksum = 0; + } + + switch(ses0->state) + { + case SNAT_SESSION_UDP_ACTIVE: + ses0->expire = now + sm->udp_timeout; + break; + case 
SNAT_SESSION_TCP_SYN_SENT: + case SNAT_SESSION_TCP_FIN_WAIT: + case SNAT_SESSION_TCP_CLOSE_WAIT: + case SNAT_SESSION_TCP_LAST_ACK: + ses0->expire = now + sm->tcp_transitory_timeout; + break; + case SNAT_SESSION_TCP_ESTABLISHED: + ses0->expire = now + sm->tcp_established_timeout; + break; + } + + trace00: + if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + snat_in2out_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->is_slow_path = 0; + t->sw_if_index = sw_if_index0; + t->next_index = next0; + t->session_index = ~0; + if (ses0) + t->session_index = ses0 - dm0->sessions; + } + + pkts_processed += next0 != SNAT_IN2OUT_NEXT_DROP; + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, snat_det_in2out_node.index, + SNAT_IN2OUT_ERROR_IN2OUT_PACKETS, + pkts_processed); + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (snat_det_in2out_node) = { + .function = snat_det_in2out_node_fn, + .name = "nat44-det-in2out", + .vector_size = sizeof (u32), + .format_trace = format_snat_in2out_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(snat_in2out_error_strings), + .error_strings = snat_in2out_error_strings, + + .runtime_data_bytes = sizeof (snat_runtime_t), + + .n_next_nodes = 3, + + /* edit / add dispositions here */ + .next_nodes = { + [SNAT_IN2OUT_NEXT_DROP] = "error-drop", + [SNAT_IN2OUT_NEXT_LOOKUP] = "ip4-lookup", + [SNAT_IN2OUT_NEXT_ICMP_ERROR] = "ip4-icmp-error", + }, +}; + +VLIB_NODE_FUNCTION_MULTIARCH (snat_det_in2out_node, snat_det_in2out_node_fn); + +/** + * Get address and port values to be used for ICMP packet translation + * and create session if needed + * + * @param[in,out] sm NAT main + * @param[in,out] node NAT node runtime + * @param[in] thread_index thread index + * @param[in,out] b0 buffer containing packet to be translated + * @param[out] p_proto protocol used for matching + * @param[out] p_value address and port after NAT translation + * @param[out] p_dont_translate if packet should not be translated + * @param d optional parameter + * @param e optional parameter + */ +u32 icmp_match_in2out_det(snat_main_t *sm, vlib_node_runtime_t *node, + u32 thread_index, vlib_buffer_t *b0, + ip4_header_t *ip0, u8 *p_proto, + snat_session_key_t *p_value, + u8 *p_dont_translate, void *d, void *e) +{ + icmp46_header_t *icmp0; + u32 sw_if_index0; + u32 rx_fib_index0; + u8 protocol; + snat_det_out_key_t key0; + u8 dont_translate = 0; + u32 next0 = ~0; + icmp_echo_header_t *echo0, *inner_echo0 = 0; + ip4_header_t *inner_ip0; + void *l4_header = 0; + icmp46_header_t *inner_icmp0; + snat_det_map_t * dm0 = 0; + ip4_address_t new_addr0; + u16 lo_port0, i0; + snat_det_session_t * ses0 = 0; + ip4_address_t in_addr; + u16 in_port; + + icmp0 = (icmp46_header_t *) ip4_next_header (ip0); + echo0 = (icmp_echo_header_t *)(icmp0+1); + sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX]; + rx_fib_index0 = ip4_fib_table_get_index_for_sw_if_index (sw_if_index0); + + if (!icmp_is_error_message (icmp0)) + { + protocol = SNAT_PROTOCOL_ICMP; + in_addr = ip0->src_address; + in_port = echo0->identifier; + } + else + { + inner_ip0 = (ip4_header_t *)(echo0+1); + l4_header = ip4_next_header (inner_ip0); + protocol = ip_proto_to_snat_proto (inner_ip0->protocol); + in_addr = inner_ip0->dst_address; + switch (protocol) + 
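/* an ICMP error carries the packet that triggered it; the inner destination identifies the inside host, so key the lookup on the inner L4 destination port (or the inner echo identifier for ICMP-in-ICMP) */ +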
{ + case SNAT_PROTOCOL_ICMP: + inner_icmp0 = (icmp46_header_t*)l4_header; + inner_echo0 = (icmp_echo_header_t *)(inner_icmp0+1); + in_port = inner_echo0->identifier; + break; + case SNAT_PROTOCOL_UDP: + case SNAT_PROTOCOL_TCP: + in_port = ((tcp_udp_header_t*)l4_header)->dst_port; + break; + default: + b0->error = node->errors[SNAT_IN2OUT_ERROR_UNSUPPORTED_PROTOCOL]; + next0 = SNAT_IN2OUT_NEXT_DROP; + goto out; + } + } + + dm0 = snat_det_map_by_user(sm, &in_addr); + if (PREDICT_FALSE(!dm0)) + { + clib_warning("no match for internal host %U", + format_ip4_address, &in_addr); + if (PREDICT_FALSE(snat_not_translate_fast(sm, node, sw_if_index0, ip0, + IP_PROTOCOL_ICMP, rx_fib_index0))) + { + dont_translate = 1; + goto out; + } + next0 = SNAT_IN2OUT_NEXT_DROP; + b0->error = node->errors[SNAT_IN2OUT_ERROR_NO_TRANSLATION]; + goto out; + } + + snat_det_forward(dm0, &in_addr, &new_addr0, &lo_port0); + + key0.ext_host_addr = ip0->dst_address; + key0.ext_host_port = 0; + + ses0 = snat_det_find_ses_by_in(dm0, &in_addr, in_port, key0); + if (PREDICT_FALSE(!ses0)) + { + if (PREDICT_FALSE(snat_not_translate_fast(sm, node, sw_if_index0, ip0, + IP_PROTOCOL_ICMP, rx_fib_index0))) + { + dont_translate = 1; + goto out; + } + if (icmp0->type != ICMP4_echo_request) + { + b0->error = node->errors[SNAT_IN2OUT_ERROR_BAD_ICMP_TYPE]; + next0 = SNAT_IN2OUT_NEXT_DROP; + goto out; + } + for (i0 = 0; i0 < dm0->ports_per_host; i0++) + { + key0.out_port = clib_host_to_net_u16 (lo_port0 + + ((i0 + clib_net_to_host_u16 (echo0->identifier)) % dm0->ports_per_host)); + + if (snat_det_get_ses_by_out (dm0, &in_addr, key0.as_u64)) + continue; + + ses0 = snat_det_ses_create(dm0, &in_addr, echo0->identifier, &key0); + break; + } + if (PREDICT_FALSE(!ses0)) + { + next0 = SNAT_IN2OUT_NEXT_DROP; + b0->error = node->errors[SNAT_IN2OUT_ERROR_OUT_OF_PORTS]; + goto out; + } + } + + if (PREDICT_FALSE(icmp0->type != ICMP4_echo_request && + !icmp_is_error_message (icmp0))) + { + b0->error = node->errors[SNAT_IN2OUT_ERROR_BAD_ICMP_TYPE]; + next0 = SNAT_IN2OUT_NEXT_DROP; + goto out; + } + + u32 now = (u32) vlib_time_now (sm->vlib_main); + + ses0->state = SNAT_SESSION_ICMP_ACTIVE; + ses0->expire = now + sm->icmp_timeout; + +out: + *p_proto = protocol; + if (ses0) + { + p_value->addr = new_addr0; + p_value->fib_index = sm->outside_fib_index; + p_value->port = ses0->out.out_port; + } + *p_dont_translate = dont_translate; + if (d) + *(snat_det_session_t**)d = ses0; + if (e) + *(snat_det_map_t**)e = dm0; + return next0; +} + +/**********************/ +/*** worker handoff ***/ +/**********************/ +static inline uword +snat_in2out_worker_handoff_fn_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame, + u8 is_output) +{ + snat_main_t *sm = &snat_main; + vlib_thread_main_t *tm = vlib_get_thread_main (); + u32 n_left_from, *from, *to_next = 0; + static __thread vlib_frame_queue_elt_t **handoff_queue_elt_by_worker_index; + static __thread vlib_frame_queue_t **congested_handoff_queue_by_worker_index + = 0; + vlib_frame_queue_elt_t *hf = 0; + vlib_frame_t *f = 0; + int i; + u32 n_left_to_next_worker = 0, *to_next_worker = 0; + u32 next_worker_index = 0; + u32 current_worker_index = ~0; + u32 thread_index = vlib_get_thread_index (); + u32 fq_index; + u32 to_node_index; + + ASSERT (vec_len (sm->workers)); + + if (is_output) + { + fq_index = sm->fq_in2out_output_index; + to_node_index = sm->in2out_output_node_index; + } + else + { + fq_index = sm->fq_in2out_index; + to_node_index = sm->in2out_node_index; + } + + if 
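/* first frame on this thread: lazily size the per-worker handoff-element and congestion-queue vectors */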
(PREDICT_FALSE (handoff_queue_elt_by_worker_index == 0)) + { + vec_validate (handoff_queue_elt_by_worker_index, tm->n_vlib_mains - 1); + + vec_validate_init_empty (congested_handoff_queue_by_worker_index, + sm->first_worker_index + sm->num_workers - 1, + (vlib_frame_queue_t *) (~0)); + } + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + + while (n_left_from > 0) + { + u32 bi0; + vlib_buffer_t *b0; + u32 sw_if_index0; + u32 rx_fib_index0; + ip4_header_t * ip0; + u8 do_handoff; + + bi0 = from[0]; + from += 1; + n_left_from -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX]; + rx_fib_index0 = ip4_fib_table_get_index_for_sw_if_index(sw_if_index0); + + ip0 = vlib_buffer_get_current (b0); + + next_worker_index = sm->worker_in2out_cb(ip0, rx_fib_index0); + + if (PREDICT_FALSE (next_worker_index != thread_index)) + { + do_handoff = 1; + + if (next_worker_index != current_worker_index) + { + if (hf) + hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker; + + hf = vlib_get_worker_handoff_queue_elt (fq_index, + next_worker_index, + handoff_queue_elt_by_worker_index); + + n_left_to_next_worker = VLIB_FRAME_SIZE - hf->n_vectors; + to_next_worker = &hf->buffer_index[hf->n_vectors]; + current_worker_index = next_worker_index; + } + + /* enqueue to correct worker thread */ + to_next_worker[0] = bi0; + to_next_worker++; + n_left_to_next_worker--; + + if (n_left_to_next_worker == 0) + { + hf->n_vectors = VLIB_FRAME_SIZE; + vlib_put_frame_queue_elt (hf); + current_worker_index = ~0; + handoff_queue_elt_by_worker_index[next_worker_index] = 0; + hf = 0; + } + } + else + { + do_handoff = 0; + /* if this is 1st frame */ + if (!f) + { + f = vlib_get_frame_to_node (vm, to_node_index); + to_next = vlib_frame_vector_args (f); + } + + to_next[0] = bi0; + to_next += 1; + f->n_vectors++; + } + + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + snat_in2out_worker_handoff_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->next_worker_index = next_worker_index; + t->do_handoff = do_handoff; + } + } + + if (f) + vlib_put_frame_to_node (vm, to_node_index, f); + + if (hf) + hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker; + + /* Ship frames to the worker nodes */ + for (i = 0; i < vec_len (handoff_queue_elt_by_worker_index); i++) + { + if (handoff_queue_elt_by_worker_index[i]) + { + hf = handoff_queue_elt_by_worker_index[i]; + /* + * It works better to let the handoff node + * rate-adapt, always ship the handoff queue element. 
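+ * The "if (1 || ...)" below therefore short-circuits the last_n_vectors + * comparison and unconditionally posts the element to the worker's queue.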
+ */ + if (1 || hf->n_vectors == hf->last_n_vectors) + { + vlib_put_frame_queue_elt (hf); + handoff_queue_elt_by_worker_index[i] = 0; + } + else + hf->last_n_vectors = hf->n_vectors; + } + congested_handoff_queue_by_worker_index[i] = + (vlib_frame_queue_t *) (~0); + } + hf = 0; + current_worker_index = ~0; + return frame->n_vectors; +} + +static uword +snat_in2out_worker_handoff_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return snat_in2out_worker_handoff_fn_inline (vm, node, frame, 0); +} + +VLIB_REGISTER_NODE (snat_in2out_worker_handoff_node) = { + .function = snat_in2out_worker_handoff_fn, + .name = "nat44-in2out-worker-handoff", + .vector_size = sizeof (u32), + .format_trace = format_snat_in2out_worker_handoff_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_next_nodes = 1, + + .next_nodes = { + [0] = "error-drop", + }, +}; + +VLIB_NODE_FUNCTION_MULTIARCH (snat_in2out_worker_handoff_node, + snat_in2out_worker_handoff_fn); + +static uword +snat_in2out_output_worker_handoff_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return snat_in2out_worker_handoff_fn_inline (vm, node, frame, 1); +} + +VLIB_REGISTER_NODE (snat_in2out_output_worker_handoff_node) = { + .function = snat_in2out_output_worker_handoff_fn, + .name = "nat44-in2out-output-worker-handoff", + .vector_size = sizeof (u32), + .format_trace = format_snat_in2out_worker_handoff_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_next_nodes = 1, + + .next_nodes = { + [0] = "error-drop", + }, +}; + +VLIB_NODE_FUNCTION_MULTIARCH (snat_in2out_output_worker_handoff_node, + snat_in2out_output_worker_handoff_fn); + +static_always_inline int +is_hairpinning (snat_main_t *sm, ip4_address_t * dst_addr) +{ + snat_address_t * ap; + clib_bihash_kv_8_8_t kv, value; + snat_session_key_t m_key; + + vec_foreach (ap, sm->addresses) + { + if (ap->addr.as_u32 == dst_addr->as_u32) + return 1; + } + + m_key.addr.as_u32 = dst_addr->as_u32; + m_key.fib_index = sm->outside_fib_index; + m_key.port = 0; + m_key.protocol = 0; + kv.key = m_key.as_u64; + if (!clib_bihash_search_8_8 (&sm->static_mapping_by_external, &kv, &value)) + return 1; + + return 0; +} + +static uword +snat_hairpin_dst_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 n_left_from, * from, * to_next; + snat_in2out_next_t next_index; + u32 pkts_processed = 0; + snat_main_t * sm = &snat_main; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t * b0; + u32 next0; + ip4_header_t * ip0; + u32 proto0; + + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + next0 = SNAT_IN2OUT_NEXT_LOOKUP; + ip0 = vlib_buffer_get_current (b0); + + proto0 = ip_proto_to_snat_proto (ip0->protocol); + + vnet_buffer (b0)->snat.flags = 0; + if (PREDICT_FALSE (is_hairpinning (sm, &ip0->dst_address))) + { + if (proto0 == SNAT_PROTOCOL_TCP || proto0 == SNAT_PROTOCOL_UDP) + { + udp_header_t * udp0 = ip4_next_header (ip0); + tcp_header_t * tcp0 = (tcp_header_t *) udp0; + + snat_hairpinning (sm, b0, ip0, udp0, tcp0, proto0); + } + else if (proto0 == SNAT_PROTOCOL_ICMP) + { + icmp46_header_t * icmp0 = 
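/* ICMP hairpinning is keyed on the echo identifier rather than on L4 ports */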
ip4_next_header (ip0); + + snat_icmp_hairpinning (sm, b0, ip0, icmp0); + } + else + { + snat_hairpinning_unknown_proto (sm, b0, ip0); + } + + vnet_buffer (b0)->snat.flags = SNAT_FLAG_HAIRPINNING; + clib_warning("is hairpinning"); + } + + pkts_processed += next0 != SNAT_IN2OUT_NEXT_DROP; + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, snat_hairpin_dst_node.index, + SNAT_IN2OUT_ERROR_IN2OUT_PACKETS, + pkts_processed); + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (snat_hairpin_dst_node) = { + .function = snat_hairpin_dst_fn, + .name = "nat44-hairpin-dst", + .vector_size = sizeof (u32), + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN(snat_in2out_error_strings), + .error_strings = snat_in2out_error_strings, + .n_next_nodes = 2, + .next_nodes = { + [SNAT_IN2OUT_NEXT_DROP] = "error-drop", + [SNAT_IN2OUT_NEXT_LOOKUP] = "ip4-lookup", + }, +}; + +VLIB_NODE_FUNCTION_MULTIARCH (snat_hairpin_dst_node, + snat_hairpin_dst_fn); + +static uword +snat_hairpin_src_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 n_left_from, * from, * to_next; + snat_in2out_next_t next_index; + u32 pkts_processed = 0; + snat_main_t *sm = &snat_main; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t * b0; + u32 next0; + snat_interface_t *i; + u32 sw_if_index0; + + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX]; + next0 = SNAT_HAIRPIN_SRC_NEXT_INTERFACE_OUTPUT; + + pool_foreach (i, sm->output_feature_interfaces, + ({ + /* Only packets from NAT inside interface */ + if ((i->is_inside == 1) && (sw_if_index0 == i->sw_if_index)) + { + if (PREDICT_FALSE ((vnet_buffer (b0)->snat.flags) & + SNAT_FLAG_HAIRPINNING)) + { + if (PREDICT_TRUE (sm->num_workers > 1)) + next0 = SNAT_HAIRPIN_SRC_NEXT_SNAT_IN2OUT_WH; + else + next0 = SNAT_HAIRPIN_SRC_NEXT_SNAT_IN2OUT; + } + break; + } + })); + + pkts_processed += next0 != SNAT_IN2OUT_NEXT_DROP; + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, snat_hairpin_src_node.index, + SNAT_IN2OUT_ERROR_IN2OUT_PACKETS, + pkts_processed); + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (snat_hairpin_src_node) = { + .function = snat_hairpin_src_fn, + .name = "nat44-hairpin-src", + .vector_size = sizeof (u32), + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN(snat_in2out_error_strings), + .error_strings = snat_in2out_error_strings, + .n_next_nodes = SNAT_HAIRPIN_SRC_N_NEXT, + .next_nodes = { + [SNAT_HAIRPIN_SRC_NEXT_DROP] = "error-drop", + [SNAT_HAIRPIN_SRC_NEXT_SNAT_IN2OUT] = "nat44-in2out-output", + [SNAT_HAIRPIN_SRC_NEXT_INTERFACE_OUTPUT] = "interface-output", + [SNAT_HAIRPIN_SRC_NEXT_SNAT_IN2OUT_WH] = 
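/* multi-worker: hairpinned traffic is re-dispatched through the worker handoff node */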
"nat44-in2out-output-worker-handoff", + }, +}; + +VLIB_NODE_FUNCTION_MULTIARCH (snat_hairpin_src_node, + snat_hairpin_src_fn); + +static uword +snat_in2out_fast_static_map_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 n_left_from, * from, * to_next; + snat_in2out_next_t next_index; + u32 pkts_processed = 0; + snat_main_t * sm = &snat_main; + u32 stats_node_index; + + stats_node_index = snat_in2out_fast_node.index; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t * b0; + u32 next0; + u32 sw_if_index0; + ip4_header_t * ip0; + ip_csum_t sum0; + u32 new_addr0, old_addr0; + u16 old_port0, new_port0; + udp_header_t * udp0; + tcp_header_t * tcp0; + icmp46_header_t * icmp0; + snat_session_key_t key0, sm0; + u32 proto0; + u32 rx_fib_index0; + + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + next0 = SNAT_IN2OUT_NEXT_LOOKUP; + + ip0 = vlib_buffer_get_current (b0); + udp0 = ip4_next_header (ip0); + tcp0 = (tcp_header_t *) udp0; + icmp0 = (icmp46_header_t *) udp0; + + sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX]; + rx_fib_index0 = ip4_fib_table_get_index_for_sw_if_index(sw_if_index0); + + if (PREDICT_FALSE(ip0->ttl == 1)) + { + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + icmp4_error_set_vnet_buffer (b0, ICMP4_time_exceeded, + ICMP4_time_exceeded_ttl_exceeded_in_transit, + 0); + next0 = SNAT_IN2OUT_NEXT_ICMP_ERROR; + goto trace0; + } + + proto0 = ip_proto_to_snat_proto (ip0->protocol); + + if (PREDICT_FALSE (proto0 == ~0)) + goto trace0; + + if (PREDICT_FALSE (proto0 == SNAT_PROTOCOL_ICMP)) + { + next0 = icmp_in2out(sm, b0, ip0, icmp0, sw_if_index0, + rx_fib_index0, node, next0, ~0, 0, 0); + goto trace0; + } + + key0.addr = ip0->src_address; + key0.protocol = proto0; + key0.port = udp0->src_port; + key0.fib_index = rx_fib_index0; + + if (snat_static_mapping_match(sm, key0, &sm0, 0, 0)) + { + b0->error = node->errors[SNAT_IN2OUT_ERROR_NO_TRANSLATION]; + next0= SNAT_IN2OUT_NEXT_DROP; + goto trace0; + } + + new_addr0 = sm0.addr.as_u32; + new_port0 = sm0.port; + vnet_buffer(b0)->sw_if_index[VLIB_TX] = sm0.fib_index; + old_addr0 = ip0->src_address.as_u32; + ip0->src_address.as_u32 = new_addr0; + + sum0 = ip0->checksum; + sum0 = ip_csum_update (sum0, old_addr0, new_addr0, + ip4_header_t, + src_address /* changed member */); + ip0->checksum = ip_csum_fold (sum0); + + if (PREDICT_FALSE(new_port0 != udp0->dst_port)) + { + if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) + { + old_port0 = tcp0->src_port; + tcp0->src_port = new_port0; + + sum0 = tcp0->checksum; + sum0 = ip_csum_update (sum0, old_addr0, new_addr0, + ip4_header_t, + dst_address /* changed member */); + sum0 = ip_csum_update (sum0, old_port0, new_port0, + ip4_header_t /* cheat */, + length /* changed member */); + tcp0->checksum = ip_csum_fold(sum0); + } + else + { + old_port0 = udp0->src_port; + udp0->src_port = new_port0; + udp0->checksum = 0; + } + } + else + { + if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) + { + sum0 = tcp0->checksum; + sum0 = ip_csum_update (sum0, old_addr0, new_addr0, + ip4_header_t, + dst_address /* changed member */); + tcp0->checksum = 
ip_csum_fold(sum0); + } + } + + /* Hairpinning */ + snat_hairpinning (sm, b0, ip0, udp0, tcp0, proto0); + + trace0: + if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + snat_in2out_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->sw_if_index = sw_if_index0; + t->next_index = next0; + } + + pkts_processed += next0 != SNAT_IN2OUT_NEXT_DROP; + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, stats_node_index, + SNAT_IN2OUT_ERROR_IN2OUT_PACKETS, + pkts_processed); + return frame->n_vectors; +} + + +VLIB_REGISTER_NODE (snat_in2out_fast_node) = { + .function = snat_in2out_fast_static_map_fn, + .name = "nat44-in2out-fast", + .vector_size = sizeof (u32), + .format_trace = format_snat_in2out_fast_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(snat_in2out_error_strings), + .error_strings = snat_in2out_error_strings, + + .runtime_data_bytes = sizeof (snat_runtime_t), + + .n_next_nodes = SNAT_IN2OUT_N_NEXT, + + /* edit / add dispositions here */ + .next_nodes = { + [SNAT_IN2OUT_NEXT_DROP] = "error-drop", + [SNAT_IN2OUT_NEXT_LOOKUP] = "ip4-lookup", + [SNAT_IN2OUT_NEXT_SLOW_PATH] = "nat44-in2out-slowpath", + [SNAT_IN2OUT_NEXT_ICMP_ERROR] = "ip4-icmp-error", + }, +}; + +VLIB_NODE_FUNCTION_MULTIARCH (snat_in2out_fast_node, snat_in2out_fast_static_map_fn); diff --git a/src/plugins/nat/nat.api b/src/plugins/nat/nat.api new file mode 100644 index 00000000..d7a4a9ef --- /dev/null +++ b/src/plugins/nat/nat.api @@ -0,0 +1,1546 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @file nat.api + * @brief VPP control-plane API messages. + * + * This file defines VPP control-plane API messages which are generally + * called through a shared memory interface. 
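+ * Requests carry a client_index/context pair used to match replies, and each + * *_dump request elicits a stream of corresponding *_details records.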
+ */ + +/* + * Old "snat" APIs, will be deprecated after 17.10 + */ + +/** \brief Add/del NAT44 address range + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param is_ip4 - 1 if address type is IPv4 + @param first_ip_address - first IP address + @param last_ip_address - last IP address + @param vrf_id - VRF id of tenant, ~0 means independent of VRF + @param is_add - 1 if add, 0 if delete +*/ +autoreply define snat_add_address_range { + u32 client_index; + u32 context; + u8 is_ip4; + u8 first_ip_address[16]; + u8 last_ip_address[16]; + u32 vrf_id; + u8 is_add; +}; + +/** \brief Dump NAT44 addresses + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request +*/ +define snat_address_dump { + u32 client_index; + u32 context; +}; + +/** \brief NAT44 address details response + @param context - sender context, to match reply w/ request + @param is_ip4 - 1 if address type is IPv4 + @param ip_address - IP address + @param vrf_id - VRF id of tenant, ~0 means independent of VRF +*/ +define snat_address_details { + u32 context; + u8 is_ip4; + u8 ip_address[16]; + u32 vrf_id; +}; + +/** \brief Enable/disable NAT44 feature on the interface + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param is_add - 1 if add, 0 if delete + @param is_inside - 1 if inside, 0 if outside + @param sw_if_index - software index of the interface +*/ +autoreply define snat_interface_add_del_feature { + u32 client_index; + u32 context; + u8 is_add; + u8 is_inside; + u32 sw_if_index; +}; + +/** \brief Dump interfaces with NAT44 feature + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request +*/ +define snat_interface_dump { + u32 client_index; + u32 context; +}; + +/** \brief NAT44 interface details response + @param context - sender context, to match reply w/ request + @param is_inside - 1 if inside, 0 if outside + @param sw_if_index - software index of the interface +*/ +define snat_interface_details { + u32 context; + u8 is_inside; + u32 sw_if_index; +}; + +/** \brief Enable/disbale NAT44 as an interface output feature (postrouting + in2out translation) + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param is_add - 1 if add, 0 if delete + @param is_inside - 1 if inside, 0 if outside + @param sw_if_index - software index of the interface +*/ +autoreply define snat_interface_add_del_output_feature { + u32 client_index; + u32 context; + u8 is_add; + u8 is_inside; + u32 sw_if_index; +}; + +/** \brief Dump interfaces with NAT44 output feature + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request +*/ +define snat_interface_output_feature_dump { + u32 client_index; + u32 context; +}; + +/** \brief NAT44 interface with output feature details response + @param context - sender context, to match reply w/ request + @param is_inside - 1 if inside, 0 if outside + @param sw_if_index - software index of the interface +*/ +define snat_interface_output_feature_details { + u32 context; + u8 is_inside; + u32 sw_if_index; +}; + +/** \brief Add/delete NAT44 static mapping + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param is_add - 1 if add, 0 if 
delete + @param is_ip4 - 1 if address type is IPv4 + @param addr_only - 1 if address only mapping + @param local_ip_address - local IP address + @param external_ip_address - external IP address + @param protocol - IP protocol + @param local_port - local port number + @param external_port - external port number + @param external_sw_if_index - external interface (if set + external_ip_address is ignored, ~0 means not + used) + @param vrf_id - VRF ID +*/ +autoreply define snat_add_static_mapping { + u32 client_index; + u32 context; + u8 is_add; + u8 is_ip4; + u8 addr_only; + u8 local_ip_address[16]; + u8 external_ip_address[16]; + u8 protocol; + u16 local_port; + u16 external_port; + u32 external_sw_if_index; + u32 vrf_id; +}; + +/** \brief Dump NAT44 static mappings + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request +*/ +define snat_static_mapping_dump { + u32 client_index; + u32 context; +}; + +/** \brief NAT44 static mapping details response + @param context - sender context, to match reply w/ request + @param is_ip4 - 1 if address type is IPv4 + @param addr_only - 1 if address only mapping + @param local_ip_address - local IP address + @param external_ip_address - external IP address + @param protocol - IP protocol + @param local_port - local port number + @param external_port - external port number + @param external_sw_if_index - external interface + @param vrf_id - VRF ID +*/ +define snat_static_mapping_details { + u32 context; + u8 is_ip4; + u8 addr_only; + u8 local_ip_address[16]; + u8 external_ip_address[16]; + u8 protocol; + u16 local_port; + u16 external_port; + u32 external_sw_if_index; + u32 vrf_id; +}; + +/** \brief Control ping from client to api server request + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request +*/ +define snat_control_ping +{ + u32 client_index; + u32 context; +}; + +/** \brief Control ping from the client to the server response + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param retval - return code for the request + @param vpe_pid - the pid of the vpe, returned by the server +*/ +define snat_control_ping_reply +{ + u32 context; + i32 retval; + u32 client_index; + u32 vpe_pid; +}; + +/** \brief Show NAT plugin startup config + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request +*/ +define snat_show_config +{ + u32 client_index; + u32 context; +}; + +/** \brief Show NAT plugin startup config reply + @param context - sender context, to match reply w/ request + @param retval - return code for the request + @param static_mapping_only - if 1 dynamic translations disabled + @param static_mapping_connection_tracking - if 1 create session data + @param deterministic - if 1 deterministic mapping + @param translation_buckets - number of translation hash buckets + @param translation_memory_size - translation hash memory size + @param user_buckets - number of user hash buckets + @param user_memory_size - user hash memory size + @param max_translations_per_user - maximum number of translations per user + @param outside_vrf_id - outside VRF id + @param inside_vrf_id - default inside VRF id +*/ +define snat_show_config_reply +{ + u32 context; + i32 retval; + u8 static_mapping_only; + u8 static_mapping_connection_tracking; + u8 deterministic; + u32 translation_buckets; + u32
translation_memory_size; + u32 user_buckets; + u32 user_memory_size; + u32 max_translations_per_user; + u32 outside_vrf_id; + u32 inside_vrf_id; +}; + +/** \brief Set NAT workers + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param worker_mask - NAT workers mask +*/ +autoreply define snat_set_workers { + u32 client_index; + u32 context; + u64 worker_mask; +}; + +/** \brief Dump NAT workers + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request +*/ +define snat_worker_dump { + u32 client_index; + u32 context; +}; + +/** \brief NAT workers details response + @param context - sender context, to match reply w/ request + @param worker_index - worker index + @param lcore_id - lcore ID + @param name - worker name +*/ +define snat_worker_details { + u32 context; + u32 worker_index; + u32 lcore_id; + u8 name[64]; +}; + +/** \brief Add/delete NAT44 pool address from specific interface + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param is_add - 1 if add, 0 if delete + @param sw_if_index - software index of the interface +*/ +autoreply define snat_add_del_interface_addr { + u32 client_index; + u32 context; + u8 is_add; + u8 is_inside; + u32 sw_if_index; +}; + +/** \brief Dump NAT44 pool addresses interfaces + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request +*/ +define snat_interface_addr_dump { + u32 client_index; + u32 context; +}; + +/** \brief NAT44 pool addresses interfaces details response + @param context - sender context, to match reply w/ request + @param sw_if_index - software index of the interface +*/ +define snat_interface_addr_details { + u32 context; + u32 sw_if_index; +}; + +/** \brief Enable/disable NAT IPFIX logging + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param domain_id - observation domain ID + @param src_port - source port number + @param enable - 1 if enable, 0 if disable +*/ +autoreply define snat_ipfix_enable_disable { + u32 client_index; + u32 context; + u32 domain_id; + u16 src_port; + u8 enable; +}; + +/** \brief Dump NAT44 users + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request +*/ +define snat_user_dump { + u32 client_index; + u32 context; +}; + +/** \brief NAT44 users response + @param context - sender context, to match reply w/ request + @param vrf_id - VRF ID + @param is_ip4 - 1 if address type is IPv4 + @param ip_address - IP address + @param nsessions - number of dynamic sessions + @param nstaticsessions - number of static sessions +*/ +define snat_user_details { + u32 context; + u32 vrf_id; + u8 is_ip4; + u8 ip_address[16]; + u32 nsessions; + u32 nstaticsessions; +}; + +/** \brief NAT44 user's sessions + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param is_ip4 - 1 if address type is IPv4 + @param ip_address - IP address of the user to dump + @param vrf_id - VRF ID +*/ +define snat_user_session_dump { + u32 client_index; + u32 context; + u8 is_ip4; + u8 ip_address[16]; + u32 vrf_id; +}; + +/** \brief NAT44 user's sessions response + @param context - sender context, to match reply w/ request + @param is_ip4 - 1 if address type is IPv4 + @param
outside_ip_address - outside IP address + @param outside_port - outside port + @param inside_ip_address - inside IP address + @param inside_port - inside port + @param protocol - protocol + @param is_static - 1 if session is static + @param last_heard - last heard timer + @param total_bytes - count of bytes sent through session + @param total_pkts - count of packets sent through session +*/ +define snat_user_session_details { + u32 context; + u8 is_ip4; + u8 outside_ip_address[16]; + u16 outside_port; + u8 inside_ip_address[16]; + u16 inside_port; + u16 protocol; + u8 is_static; + u64 last_heard; + u64 total_bytes; + u32 total_pkts; +}; + +/** \brief Add/delete NAT deterministic mapping + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param is_add - 1 if add, 0 if delete + @param is_ip4 - 1 if address type is IPv4 + @param in_addr - inside IP address + @param in_plen - inside IP address prefix length + @param out_addr - outside IP address + @param out_plen - outside IP address prefix length +*/ +autoreply define snat_add_det_map { + u32 client_index; + u32 context; + u8 is_add; + u8 is_ip4; + u8 addr_only; + u8 in_addr[16]; + u8 in_plen; + u8 out_addr[16]; + u8 out_plen; +}; + +/** \brief Get outside address and port range from inside address + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param is_ip4 - 1 if address type is IPv4 + @param in_addr - inside IP address +*/ +define snat_det_forward { + u32 client_index; + u32 context; + u8 is_ip4; + u8 in_addr[16]; +}; + +/** \brief Get outside address and port range from inside address reply + @param context - sender context, to match reply w/ request + @param retval - return code + @param out_port_lo - outside port range start + @param out_port_hi - outside port range end + @param is_ip4 - 1 if address type is IPv4 + @param out_addr - outside IP address +*/ +define snat_det_forward_reply { + u32 context; + i32 retval; + u16 out_port_lo; + u16 out_port_hi; + u8 is_ip4; + u8 out_addr[16]; +}; + +/** \brief Get inside address from outside address and port + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param out_port - outside port + @param is_ip4 - 1 if address type is IPv4 + @param out_addr - outside IP address +*/ +define snat_det_reverse { + u32 client_index; + u32 context; + u16 out_port; + u8 is_ip4; + u8 out_addr[16]; +}; + +/** \brief Get inside address from outside address and port reply + @param context - sender context, to match reply w/ request + @param retval - return code + @param is_ip4 - 1 if address type is IPv4 + @param in_addr - inside IP address +*/ +define snat_det_reverse_reply { + u32 context; + i32 retval; + u8 is_ip4; + u8 in_addr[16]; +}; + +/** \brief Dump NAT deterministic mappings + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request +*/ +define snat_det_map_dump { + u32 client_index; + u32 context; +}; + +/** \brief NAT deterministic mapping details response + @param context - sender context, to match reply w/ request + @param is_ip4 - 1 if address type is IPv4 + @param in_addr - inside IP address + @param in_plen - inside IP address prefix length + @param out_addr - outside IP address + @param out_plen - outside IP address prefix length + @param sharing_ratio - outside to inside address sharing ratio + @param ports_per_host - number of ports available
to a host + @param ses_num - number of sessions belonging to this mapping +*/ +define snat_det_map_details { + u32 context; + u8 is_ip4; + u8 in_addr[16]; + u8 in_plen; + u8 out_addr[16]; + u8 out_plen; + u32 sharing_ratio; + u16 ports_per_host; + u32 ses_num; +}; + +/** \brief Set values of timeouts for deterministic NAT (seconds, 0 = default) + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param udp - UDP timeout (default 300sec) + @param tcp_established - TCP established timeout (default 7440sec) + @param tcp_transitory - TCP transitory timeout (default 240sec) + @param icmp - ICMP timeout (default 60sec) +*/ +autoreply define snat_det_set_timeouts { + u32 client_index; + u32 context; + u32 udp; + u32 tcp_established; + u32 tcp_transitory; + u32 icmp; +}; + +/** \brief Get values of timeouts for deterministic NAT (seconds) + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request +*/ +define snat_det_get_timeouts { + u32 client_index; + u32 context; +}; + +/** \brief Get values of timeouts for deterministic NAT reply + @param context - sender context, to match reply w/ request + @param retval - return code + @param udp - UDP timeout (default 300sec) + @param tcp_established - TCP established timeout (default 7440sec) + @param tcp_transitory - TCP transitory timeout (default 240sec) + @param icmp - ICMP timeout (default 60sec) +*/ +define snat_det_get_timeouts_reply { + u32 context; + i32 retval; + u32 udp; + u32 tcp_established; + u32 tcp_transitory; + u32 icmp; +}; + +/** \brief Close deterministic NAT session by outside address and port + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param is_ip4 - 1 if address type is IPv4 + @param out_addr - outside IP address + @param out_port - outside port + @param ext_addr - external host address + @param ext_port - external host port +*/ +autoreply define snat_det_close_session_out { + u32 client_index; + u32 context; + u8 is_ip4; + u8 out_addr[16]; + u16 out_port; + u8 ext_addr[16]; + u16 ext_port; +}; + +/** \brief Close deterministic NAT session by inside address and port + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param is_ip4 - 1 if address type is IPv4 + @param in_addr - inside IP address + @param in_port - inside port + @param ext_addr - external host address + @param ext_port - external host port +*/ +autoreply define snat_det_close_session_in { + u32 client_index; + u32 context; + u8 is_ip4; + u8 in_addr[16]; + u16 in_port; + u8 ext_addr[16]; + u16 ext_port; +}; + +/** \brief Dump deterministic NAT sessions + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param is_ip4 - 1 if address type is IPv4 + @param user_addr - address of an inside user whose sessions to dump +*/ +define snat_det_session_dump { + u32 client_index; + u32 context; + u8 is_ip4; + u8 user_addr[16]; +}; + +/** \brief Deterministic NAT sessions reply + @param context - sender context, to match reply w/ request + @param is_ip4 - 1 if address type is IPv4 + @param in_port - inside port + @param ext_addr - external host address + @param ext_port - external host port + @param out_port - outside NAT port + @param state - session state + @param expire - session expiration timestamp +*/ +define
+define snat_det_session_details {
+  u32 context;
+  u8 is_ip4;
+  u16 in_port;
+  u8 ext_addr[16];
+  u16 ext_port;
+  u16 out_port;
+  u8 state;
+  u32 expire;
+};
+
+/*
+ * Common NAT plugin APIs
+ */
+
+/** \brief Control ping from client to api server request
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+*/
+define nat_control_ping
+{
+  u32 client_index;
+  u32 context;
+};
+
+/** \brief Control ping from the client to the server response
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param retval - return code for the request
+    @param vpe_pid - the pid of the vpe, returned by the server
+*/
+define nat_control_ping_reply
+{
+  u32 context;
+  i32 retval;
+  u32 client_index;
+  u32 vpe_pid;
+};
+
+/** \brief Show NAT plugin startup config
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+*/
+define nat_show_config
+{
+  u32 client_index;
+  u32 context;
+};
+
+/** \brief Show NAT plugin startup config reply
+    @param context - sender context, to match reply w/ request
+    @param retval - return code for the request
+    @param static_mapping_only - if 1 dynamic translations disabled
+    @param static_mapping_connection_tracking - if 1 create session data
+    @param deterministic - if 1 deterministic mapping
+    @param translation_buckets - number of translation hash buckets
+    @param translation_memory_size - translation hash memory size
+    @param user_buckets - number of user hash buckets
+    @param user_memory_size - user hash memory size
+    @param max_translations_per_user - maximum number of translations per user
+    @param outside_vrf_id - outside VRF id
+    @param inside_vrf_id - default inside VRF id
+*/
+define nat_show_config_reply
+{
+  u32 context;
+  i32 retval;
+  u8 static_mapping_only;
+  u8 static_mapping_connection_tracking;
+  u8 deterministic;
+  u32 translation_buckets;
+  u32 translation_memory_size;
+  u32 user_buckets;
+  u32 user_memory_size;
+  u32 max_translations_per_user;
+  u32 outside_vrf_id;
+  u32 inside_vrf_id;
+};
+
+/** \brief Set NAT workers
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param worker_mask - NAT workers mask
+*/
+autoreply define nat_set_workers {
+  u32 client_index;
+  u32 context;
+  u64 worker_mask;
+};
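+
+/*
+ * worker_mask is a bitmap of worker thread indices.  For example, to
+ * restrict NAT processing to workers 1 and 2 a client would send
+ * worker_mask = (1 << 1) | (1 << 2) = 0x6 (an illustration, not a
+ * prescribed value).
+ */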
+
+/** \brief Dump NAT workers
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+*/
+define nat_worker_dump {
+  u32 client_index;
+  u32 context;
+};
+
+/** \brief NAT workers details response
+    @param context - sender context, to match reply w/ request
+    @param worker_index - worker index
+    @param lcore_id - lcore ID
+    @param name - worker name
+*/
+define nat_worker_details {
+  u32 context;
+  u32 worker_index;
+  u32 lcore_id;
+  u8 name[64];
+};
+
+/** \brief Enable/disable NAT IPFIX logging
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param domain_id - observation domain ID
+    @param src_port - source port number
+    @param enable - 1 if enable, 0 if disable
+*/
+autoreply define nat_ipfix_enable_disable {
+  u32 client_index;
+  u32 context;
+  u32 domain_id;
+  u16 src_port;
+  u8 enable;
+};
+
+/*
+ * NAT44 APIs
+ */
+
+/** \brief Add/del NAT44 address range
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param first_ip_address - first IPv4 address
+    @param last_ip_address - last IPv4 address
+    @param vrf_id - VRF id of tenant, ~0 means independent of VRF
+    @param is_add - 1 if add, 0 if delete
+*/
+autoreply define nat44_add_del_address_range {
+  u32 client_index;
+  u32 context;
+  u8 first_ip_address[4];
+  u8 last_ip_address[4];
+  u32 vrf_id;
+  u8 is_add;
+};
+
+/** \brief Dump NAT44 addresses
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+*/
+define nat44_address_dump {
+  u32 client_index;
+  u32 context;
+};
+
+/** \brief NAT44 address details response
+    @param context - sender context, to match reply w/ request
+    @param ip_address - IPv4 address
+    @param vrf_id - VRF id of tenant, ~0 means independent of VRF
+*/
+define nat44_address_details {
+  u32 context;
+  u8 ip_address[4];
+  u32 vrf_id;
+};
+
+/** \brief Enable/disable NAT44 feature on the interface
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param is_add - 1 if add, 0 if delete
+    @param is_inside - 1 if inside, 0 if outside
+    @param sw_if_index - software index of the interface
+*/
+autoreply define nat44_interface_add_del_feature {
+  u32 client_index;
+  u32 context;
+  u8 is_add;
+  u8 is_inside;
+  u32 sw_if_index;
+};
+
+/** \brief Dump interfaces with NAT44 feature
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+*/
+define nat44_interface_dump {
+  u32 client_index;
+  u32 context;
+};
+
+/** \brief NAT44 interface details response
+    @param context - sender context, to match reply w/ request
+    @param is_inside - 1 if inside, 0 if outside
+    @param sw_if_index - software index of the interface
+*/
+define nat44_interface_details {
+  u32 context;
+  u8 is_inside;
+  u32 sw_if_index;
+};
+
+/** \brief Enable/disable NAT44 as an interface output feature (postrouting
+           in2out translation)
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param is_add - 1 if add, 0 if delete
+    @param is_inside - 1 if inside, 0 if outside
+    @param sw_if_index - software index of the interface
+*/
+autoreply define nat44_interface_add_del_output_feature {
+  u32 client_index;
+  u32 context;
+  u8 is_add;
+  u8 is_inside;
+  u32 sw_if_index;
+};
+
+/** \brief Dump interfaces with NAT44 output feature
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+*/
+define nat44_interface_output_feature_dump {
+  u32 client_index;
+  u32 context;
+};
+
+/** \brief NAT44 interface with output feature details response
+    @param context - sender context, to match reply w/ request
+    @param is_inside - 1 if inside, 0 if outside
+    @param sw_if_index - software index of the interface
+*/
+define nat44_interface_output_feature_details {
+  u32 context;
+  u8 is_inside;
+  u32 sw_if_index;
+};
+
+/** \brief Add/delete NAT44 static mapping
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param is_add - 1 if add, 0 if delete
+    @param addr_only - 1 if address only mapping
+    @param local_ip_address - local IPv4 address
+    @param external_ip_address - external IPv4 address
+    @param protocol - IP protocol
+    @param local_port - local port number
+    @param external_port - external port number
+    @param external_sw_if_index - external interface (if set
+                                  external_ip_address is ignored, ~0 means not
+                                  used)
+    @param vrf_id - VRF ID
+*/
+autoreply define nat44_add_del_static_mapping {
+  u32 client_index;
+  u32 context;
+  u8 is_add;
+  u8 addr_only;
+  u8 local_ip_address[4];
+  u8 external_ip_address[4];
+  u8 protocol;
+  u16 local_port;
+  u16 external_port;
+  u32 external_sw_if_index;
+  u32 vrf_id;
+};
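+
+/*
+ * Sketch of a client filling this message to map inside 10.0.0.3:8080 to
+ * external 1.2.3.4:80 for TCP.  Illustrative only: it assumes the
+ * vppapigen-generated vl_api_nat44_add_del_static_mapping_t and the usual
+ * network byte order for multi-byte API fields.
+ *
+ *   vl_api_nat44_add_del_static_mapping_t *mp;
+ *
+ *   mp = vl_msg_api_alloc (sizeof (*mp));
+ *   memset (mp, 0, sizeof (*mp));
+ *   mp->is_add = 1;
+ *   mp->addr_only = 0;
+ *   mp->protocol = 6;                      // TCP
+ *   mp->local_port = htons (8080);
+ *   mp->external_port = htons (80);
+ *   mp->external_sw_if_index = ~0;         // use external_ip_address
+ *   mp->vrf_id = htonl (~0);               // default inside VRF
+ *   clib_memcpy (mp->local_ip_address, &local_addr, 4);
+ *   clib_memcpy (mp->external_ip_address, &external_addr, 4);
+ */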
+
+/** \brief Dump NAT44 static mappings
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+*/
+define nat44_static_mapping_dump {
+  u32 client_index;
+  u32 context;
+};
+
+/** \brief NAT44 static mapping details response
+    @param context - sender context, to match reply w/ request
+    @param addr_only - 1 if address only mapping
+    @param local_ip_address - local IPv4 address
+    @param external_ip_address - external IPv4 address
+    @param protocol - IP protocol
+    @param local_port - local port number
+    @param external_port - external port number
+    @param external_sw_if_index - external interface
+    @param vrf_id - VRF ID
+*/
+define nat44_static_mapping_details {
+  u32 context;
+  u8 addr_only;
+  u8 local_ip_address[4];
+  u8 external_ip_address[4];
+  u8 protocol;
+  u16 local_port;
+  u16 external_port;
+  u32 external_sw_if_index;
+  u32 vrf_id;
+};
+
+/** \brief Add/delete NAT44 pool address from specific interface
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param is_add - 1 if add, 0 if delete
+    @param is_inside - 1 if inside, 0 if outside
+    @param sw_if_index - software index of the interface
+*/
+autoreply define nat44_add_del_interface_addr {
+  u32 client_index;
+  u32 context;
+  u8 is_add;
+  u8 is_inside;
+  u32 sw_if_index;
+};
+
+/** \brief Dump NAT44 pool addresses interfaces
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+*/
+define nat44_interface_addr_dump {
+  u32 client_index;
+  u32 context;
+};
+
+/** \brief NAT44 pool addresses interfaces details response
+    @param context - sender context, to match reply w/ request
+    @param sw_if_index - software index of the interface
+*/
+define nat44_interface_addr_details {
+  u32 context;
+  u32 sw_if_index;
+};
+
+/** \brief Dump NAT44 users
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+*/
+define nat44_user_dump {
+  u32 client_index;
+  u32 context;
+};
+
+/** \brief NAT44 users response
+    @param context - sender context, to match reply w/ request
+    @param vrf_id - VRF ID
+    @param ip_address - IPv4 address
+    @param nsessions - number of dynamic sessions
+    @param nstaticsessions - number of static sessions
+*/
+define nat44_user_details {
+  u32 context;
+  u32 vrf_id;
+  u8 ip_address[4];
+  u32 nsessions;
+  u32 nstaticsessions;
+};
+
+/** \brief NAT44 user's sessions
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param ip_address - IPv4 address of the user whose sessions to dump
+    @param vrf_id - VRF ID
+*/
+define nat44_user_session_dump {
+  u32 client_index;
+  u32 context;
+  u8 ip_address[4];
+  u32 vrf_id;
+};
+
+/** \brief NAT44 user's sessions response
+    @param context - sender context, to match reply w/ request
+    @param outside_ip_address - outside IPv4 address
+    @param outside_port - outside port
+    @param inside_ip_address - inside IPv4 address
+    @param inside_port - inside port
+    @param protocol - protocol
+    @param is_static - 1 if session is static
+    @param last_heard - last heard timer
+    @param total_bytes - count of bytes sent through session
+    @param total_pkts - count of packets sent through session
+*/
+define nat44_user_session_details {
+  u32 context;
+  u8 outside_ip_address[4];
+  u16 outside_port;
+  u8 inside_ip_address[4];
+  u16 inside_port;
+  u16 protocol;
+  u8 is_static;
+  u64 last_heard;
+  u64 total_bytes;
+  u32 total_pkts;
+};
+
+/** \brief NAT44 load-balancing address and port pair
+    @param addr - IPv4 address of the internal node
+    @param port - L4 port number of the internal node
+    @param probability - probability of the internal node to be matched
+*/
+typeonly manual_endian define nat44_lb_addr_port {
+  u8 addr[4];
+  u16 port;
+  u8 probability;
+};
+
+/** \brief Add/delete NAT44 load-balancing static mapping rule
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param is_add - 1 if add, 0 if delete
+    @param external_addr - external IPv4 address
+    @param external_port - external port number
+    @param protocol - IP protocol
+    @param vrf_id - VRF ID
+    @param local_num - number of local network nodes
+    @param locals - local network nodes
+*/
+autoreply manual_endian define nat44_add_del_lb_static_mapping {
+  u32 client_index;
+  u32 context;
+  u8 is_add;
+  u8 external_addr[4];
+  u16 external_port;
+  u8 protocol;
+  u32 vrf_id;
+  u8 local_num;
+  vl_api_nat44_lb_addr_port_t locals[local_num];
+};
+
+/** \brief Dump NAT44 load-balancing static mapping rules
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+*/
+define nat44_lb_static_mapping_dump {
+  u32 client_index;
+  u32 context;
+};
+
+/** \brief NAT44 load-balancing static mapping rule details response
+    @param context - sender context, to match reply w/ request
+    @param external_addr - external IPv4 address
+    @param external_port - external port number
+    @param protocol - IP protocol
+    @param vrf_id - VRF ID
+    @param local_num - number of local network nodes
+    @param locals - local network nodes
+*/
+manual_endian define nat44_lb_static_mapping_details {
+  u32 context;
+  u8 external_addr[4];
+  u16 external_port;
+  u8 protocol;
+  u32 vrf_id;
+  u8 local_num;
+  vl_api_nat44_lb_addr_port_t locals[local_num];
+};
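+
+/*
+ * Load-balancing note: the handler turns each local's probability into a
+ * running (cumulative) sum.  With probabilities 10, 20 and 70 the
+ * cumulative values are 10, 30 and 100; a random number in [1, 100] then
+ * selects the first local whose cumulative value covers it, so backends
+ * are matched in proportion to their probabilities (see
+ * snat_static_mapping_match in nat.c).
+ */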
+
+/*
+ * Deterministic NAT (CGN) APIs
+ */
+
+/** \brief Add/delete NAT deterministic mapping
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param is_add - 1 if add, 0 if delete
+    @param is_nat44 - 1 if NAT44
+    @param in_addr - inside IP address
+    @param in_plen - inside IP address prefix length
+    @param out_addr - outside IPv4 address
+    @param out_plen - outside IPv4 address prefix length
+*/
+autoreply define nat_det_add_del_map {
+  u32 client_index;
+  u32 context;
+  u8 is_add;
+  u8 is_nat44;
+  u8 addr_only;
+  u8 in_addr[16];
+  u8 in_plen;
+  u8 out_addr[4];
+  u8 out_plen;
+};
+
+/** \brief Get outside address and port range from inside address
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param is_nat44 - 1 if NAT44
+    @param in_addr - inside IP address
+*/
+define nat_det_forward {
+  u32 client_index;
+  u32 context;
+  u8 is_nat44;
+  u8 in_addr[16];
+};
+
+/** \brief Get outside address and port range from inside address reply
+    @param context - sender context, to match reply w/ request
+    @param retval - return code
+    @param out_port_lo - outside port range start
+    @param out_port_hi - outside port range end
+    @param out_addr - outside IPv4 address
+*/
+define nat_det_forward_reply {
+  u32 context;
+  i32 retval;
+  u16 out_port_lo;
+  u16 out_port_hi;
+  u8 out_addr[4];
+};
+
+/** \brief Get inside address from outside address and port
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param out_port - outside port
+    @param out_addr - outside IPv4 address
+*/
+define nat_det_reverse {
+  u32 client_index;
+  u32 context;
+  u16 out_port;
+  u8 out_addr[4];
+};
+
+/** \brief Get inside address from outside address and port reply
+    @param context - sender context, to match reply w/ request
+    @param retval - return code
+    @param is_nat44 - 1 if NAT44
+    @param in_addr - inside IP address
+*/
+define nat_det_reverse_reply {
+  u32 context;
+  i32 retval;
+  u8 is_nat44;
+  u8 in_addr[16];
+};
+
+/** \brief Dump NAT deterministic mappings
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+*/
+define nat_det_map_dump {
+  u32 client_index;
+  u32 context;
+};
+
+/** \brief NAT deterministic mapping details response
+    @param context - sender context, to match reply w/ request
+    @param is_nat44 - 1 if NAT44
+    @param in_addr - inside IP address
+    @param in_plen - inside IP address prefix length
+    @param out_addr - outside IPv4 address
+    @param out_plen - outside IPv4 address prefix length
+    @param sharing_ratio - outside to inside address sharing ratio
+    @param ports_per_host - number of ports available to a host
+    @param ses_num - number of sessions belonging to this mapping
+*/
+define nat_det_map_details {
+  u32 context;
+  u8 is_nat44;
+  u8 in_addr[16];
+  u8 in_plen;
+  u8 out_addr[4];
+  u8 out_plen;
+  u32 sharing_ratio;
+  u16 ports_per_host;
+  u32 ses_num;
+};
+
+/** \brief Set values of timeouts for deterministic NAT (seconds, 0 = default)
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param udp - UDP timeout (default 300sec)
+    @param tcp_established - TCP established timeout (default 7440sec)
+    @param tcp_transitory - TCP transitory timeout (default 240sec)
+    @param icmp - ICMP timeout (default 60sec)
+*/
+autoreply define nat_det_set_timeouts {
+  u32 client_index;
+  u32 context;
+  u32 udp;
+  u32 tcp_established;
+  u32 tcp_transitory;
+  u32 icmp;
+};
+
+/** \brief Get values of timeouts for deterministic NAT (seconds)
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+*/
+define nat_det_get_timeouts {
+  u32 client_index;
+  u32 context;
+};
+
+/** \brief Get values of timeouts for deterministic NAT reply
+    @param context - sender context, to match reply w/ request
+    @param retval - return code
+    @param udp - UDP timeout (default 300sec)
+    @param tcp_established - TCP established timeout (default 7440sec)
+    @param tcp_transitory - TCP transitory timeout (default 240sec)
+    @param icmp - ICMP timeout (default 60sec)
+*/
+define nat_det_get_timeouts_reply {
+  u32 context;
+  i32 retval;
+  u32 udp;
+  u32 tcp_established;
+  u32 tcp_transitory;
+  u32 icmp;
+};
+
+/** \brief Close deterministic NAT session by outside address and port
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param out_addr - outside IPv4 address
+    @param out_port - outside port
+    @param ext_addr - external host IPv4 address
+    @param ext_port - external host port
+*/
+autoreply define nat_det_close_session_out {
+  u32 client_index;
+  u32 context;
+  u8 out_addr[4];
+  u16 out_port;
+  u8 ext_addr[4];
+  u16 ext_port;
+};
+
+/** \brief Close deterministic NAT session by inside address and port
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param is_nat44 - 1 if NAT44
+    @param in_addr - inside IP address
+    @param in_port - inside port
+    @param ext_addr - external host IP address
+    @param ext_port - external host port
+*/
+autoreply define nat_det_close_session_in {
+  u32 client_index;
+  u32 context;
+  u8 is_nat44;
+  u8 in_addr[16];
+  u16 in_port;
+  u8 ext_addr[16];
+  u16 ext_port;
+};
+
+/** \brief Dump deterministic NAT sessions
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param is_nat44 - 1 if NAT44
+    @param user_addr - address of an inside user whose sessions to dump
+*/
+define nat_det_session_dump {
+  u32 client_index;
+  u32 context;
+  u8 is_nat44;
+  u8 user_addr[16];
+};
+
+/** \brief Deterministic NAT sessions reply
+    @param context - sender context, to match reply w/ request
+    @param in_port - inside port
+    @param ext_addr - external host address
+    @param ext_port - external host port
+    @param out_port - outside NAT port
+    @param state - session state
+    @param expire - session expiration timestamp
+*/
+define nat_det_session_details {
+  u32 context;
+  u16 in_port;
+  u8 ext_addr[4];
+  u16 ext_port;
+  u16 out_port;
+  u8 state;
+  u32 expire;
+};
+
+/*
+ * NAT64 APIs
+ */
+
+/** \brief Add/delete address range to NAT64 pool
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param start_addr - start address of the range
+    @param end_addr - end address of the range
+    @param vrf_id - VRF id of tenant, ~0 means independent of VRF
+    @param is_add - 1 if add, 0 if delete
+*/
+autoreply define nat64_add_del_pool_addr_range {
+  u32 client_index;
+  u32 context;
+  u8 start_addr[4];
+  u8 end_addr[4];
+  u32 vrf_id;
+  u8 is_add;
+};
+
+/** \brief Dump NAT64 pool addresses
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+*/
+define nat64_pool_addr_dump {
+  u32 client_index;
+  u32 context;
+};
+
+/** \brief NAT64 pool address details response
+    @param context - sender context, to match reply w/ request
+    @param address - IPv4 address
+    @param vrf_id - VRF id of tenant, ~0 means independent of VRF
+*/
+define nat64_pool_addr_details {
+  u32 context;
+  u8 address[4];
+  u32 vrf_id;
+};
+
+/** \brief Enable/disable NAT64 feature on the interface
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param sw_if_index - index of the interface
+    @param is_inside - 1 if inside, 0 if outside
+    @param is_add - 1 if add, 0 if delete
+*/
+autoreply define nat64_add_del_interface {
+  u32 client_index;
+  u32 context;
+  u32 sw_if_index;
+  u8 is_inside;
+  u8 is_add;
+};
+
+/** \brief Dump interfaces with NAT64 feature
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+*/
+define nat64_interface_dump {
+  u32 client_index;
+  u32 context;
+};
+
+/** \brief NAT64 interface details response
+    @param context - sender context, to match reply w/ request
+    @param is_inside - 1 if inside, 0 if outside
+    @param sw_if_index - index of the interface
+*/
+define nat64_interface_details {
+  u32 context;
+  u8 is_inside;
+  u32 sw_if_index;
+};
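+
+/*
+ * Terminology follows RFC 6146: a BIB (Binding Information Base) entry
+ * binds an inside IPv6 transport address to an outside IPv4 transport
+ * address, while a session table (ST) entry additionally records the
+ * remote host using that binding.
+ */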
+
+/** \brief Add/delete NAT64 static BIB entry
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param i_addr - inside IPv6 address
+    @param o_addr - outside IPv4 address
+    @param i_port - inside port number
+    @param o_port - outside port number
+    @param vrf_id - VRF id of tenant
+    @param proto - protocol number
+    @param is_add - 1 if add, 0 if delete
+*/
+autoreply define nat64_add_del_static_bib {
+  u32 client_index;
+  u32 context;
+  u8 i_addr[16];
+  u8 o_addr[4];
+  u16 i_port;
+  u16 o_port;
+  u32 vrf_id;
+  u8 proto;
+  u8 is_add;
+};
+
+/** \brief Dump NAT64 BIB
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param proto - protocol of the BIB: 255 - all BIBs
+                                        6 - TCP BIB
+                                        17 - UDP BIB
+                                        1/58 - ICMP BIB
+                                        otherwise - "unknown" protocol BIB
+*/
+define nat64_bib_dump {
+  u32 client_index;
+  u32 context;
+  u8 proto;
+};
+
+/** \brief NAT64 BIB details response
+    @param context - sender context, to match reply w/ request
+    @param i_addr - inside IPv6 address
+    @param o_addr - outside IPv4 address
+    @param i_port - inside port number
+    @param o_port - outside port number
+    @param vrf_id - VRF id of tenant
+    @param proto - protocol number
+    @param is_static - 1 if static BIB entry, 0 if dynamic
+    @param ses_num - number of sessions associated with the BIB entry
+*/
+define nat64_bib_details {
+  u32 context;
+  u8 i_addr[16];
+  u8 o_addr[4];
+  u16 i_port;
+  u16 o_port;
+  u32 vrf_id;
+  u8 proto;
+  u8 is_static;
+  u32 ses_num;
+};
+
+/** \brief Set values of timeouts for NAT64 (seconds, 0 = default)
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param udp - UDP timeout (default 300sec)
+    @param icmp - ICMP timeout (default 60sec)
+    @param tcp_trans - TCP transitory timeout (default 240sec)
+    @param tcp_est - TCP established timeout (default 7440sec)
+    @param tcp_incoming_syn - TCP incoming SYN timeout (default 6sec)
+*/
+autoreply define nat64_set_timeouts {
+  u32 client_index;
+  u32 context;
+  u32 udp;
+  u32 icmp;
+  u32 tcp_trans;
+  u32 tcp_est;
+  u32 tcp_incoming_syn;
+};
+
+/** \brief Get values of timeouts for NAT64 (seconds)
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+*/
+define nat64_get_timeouts {
+  u32 client_index;
+  u32 context;
+};
+
+/** \brief Get values of timeouts for NAT64 reply
+    @param context - sender context, to match reply w/ request
+    @param retval - return code
+    @param udp - UDP timeout
+    @param icmp - ICMP timeout
+    @param tcp_trans - TCP transitory timeout
+    @param tcp_est - TCP established timeout
+    @param tcp_incoming_syn - TCP incoming SYN timeout
+*/
+define nat64_get_timeouts_reply {
+  u32 context;
+  i32 retval;
+  u32 udp;
+  u32 icmp;
+  u32 tcp_trans;
+  u32 tcp_est;
+  u32 tcp_incoming_syn;
+};
+
+/** \brief Dump NAT64 session table
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param proto - protocol of the session table: 255 - all STs
+                                                  6 - TCP ST
+                                                  17 - UDP ST
+                                                  1/58 - ICMP ST
+                                                  otherwise - "unknown" proto ST
+*/
+define nat64_st_dump {
+  u32 client_index;
+  u32 context;
+  u8 proto;
+};
+
+/** \brief NAT64 session table details response
+    @param context - sender context, to match reply w/ request
+    @param il_addr - inside IPv6 address of the local host
+    @param ol_addr - outside IPv4 address of the local host
+    @param il_port - inside port number id of the local host/inside ICMP id
+    @param ol_port - outside port number of the local host/outside ICMP id
+    @param ir_addr - inside IPv6 address of the remote host
+    @param or_addr - outside IPv4 address of the remote host
+    @param r_port - port number of the remote host (not used for ICMP)
+    @param vrf_id - VRF id of tenant
+    @param proto - protocol number
+*/
+define nat64_st_details {
+  u32 context;
+  u8 il_addr[16];
+  u8 ol_addr[4];
+  u16 il_port;
+  u16 ol_port;
+  u8 ir_addr[16];
+  u8 or_addr[4];
+  u16 r_port;
+  u32 vrf_id;
+  u8 proto;
+};
+
+/** \brief Add/del NAT64 prefix
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param prefix - NAT64 prefix
+    @param prefix_len - NAT64 prefix length
+    @param vrf_id - VRF id of tenant
+    @param is_add - 1 if add, 0 if delete
+*/
+autoreply define nat64_add_del_prefix {
+  u32 client_index;
+  u32 context;
+  u8 prefix[16];
+  u8 prefix_len;
+  u32 vrf_id;
+  u8 is_add;
+};
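+
+/*
+ * The NAT64 prefix is the IPv6 prefix used to synthesize IPv6 addresses
+ * from IPv4 addresses per RFC 6052, e.g. the well-known prefix
+ * 64:ff9b::/96.
+ */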
+
+/** \brief Dump NAT64 prefix
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+*/
+define nat64_prefix_dump {
+  u32 client_index;
+  u32 context;
+};
+
+/** \brief Dump NAT64 prefix details response
+    @param context - sender context, to match reply w/ request
+    @param prefix - NAT64 prefix
+    @param prefix_len - NAT64 prefix length
+    @param vrf_id - VRF id of tenant
+*/
+define nat64_prefix_details {
+  u32 context;
+  u8 prefix[16];
+  u8 prefix_len;
+  u32 vrf_id;
+};
diff --git a/src/plugins/nat/nat.c b/src/plugins/nat/nat.c
new file mode 100644
index 00000000..9bdb0351
--- /dev/null
+++ b/src/plugins/nat/nat.c
@@ -0,0 +1,3229 @@
+/*
+ * snat.c - simple nat plugin
+ *
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ip/ip4.h>
+#include <vnet/plugin/plugin.h>
+#include <nat/nat.h>
+#include <nat/nat_ipfix_logging.h>
+#include <nat/nat_det.h>
+#include <nat/nat64.h>
+#include <vnet/fib/fib_table.h>
+#include <vnet/fib/ip4_fib.h>
+
+#include <vpp/app/version.h>
+
+snat_main_t snat_main;
+
+
+/* Hook up input features */
+VNET_FEATURE_INIT (ip4_snat_in2out, static) = {
+  .arc_name = "ip4-unicast",
+  .node_name = "nat44-in2out",
+  .runs_before = VNET_FEATURES ("nat44-out2in"),
+};
+VNET_FEATURE_INIT (ip4_snat_out2in, static) = {
+  .arc_name = "ip4-unicast",
+  .node_name = "nat44-out2in",
+  .runs_before = VNET_FEATURES ("ip4-lookup"),
+};
+VNET_FEATURE_INIT (ip4_snat_det_in2out, static) = {
+  .arc_name = "ip4-unicast",
+  .node_name = "nat44-det-in2out",
+  .runs_before = VNET_FEATURES ("nat44-det-out2in"),
+};
+VNET_FEATURE_INIT (ip4_snat_det_out2in, static) = {
+  .arc_name = "ip4-unicast",
+  .node_name = "nat44-det-out2in",
+  .runs_before = VNET_FEATURES ("ip4-lookup"),
+};
+VNET_FEATURE_INIT (ip4_snat_in2out_worker_handoff, static) = {
+  .arc_name = "ip4-unicast",
+  .node_name = "nat44-in2out-worker-handoff",
+  .runs_before = VNET_FEATURES ("nat44-out2in-worker-handoff"),
+};
+VNET_FEATURE_INIT (ip4_snat_out2in_worker_handoff, static) = {
+  .arc_name = "ip4-unicast",
+  .node_name = "nat44-out2in-worker-handoff",
+  .runs_before = VNET_FEATURES ("ip4-lookup"),
+};
+VNET_FEATURE_INIT (ip4_snat_in2out_fast, static) = {
+  .arc_name = "ip4-unicast",
+  .node_name = "nat44-in2out-fast",
+  .runs_before = VNET_FEATURES ("nat44-out2in-fast"),
+};
+VNET_FEATURE_INIT (ip4_snat_out2in_fast, static) = {
+  .arc_name = "ip4-unicast",
+  .node_name = "nat44-out2in-fast",
+  .runs_before = VNET_FEATURES ("ip4-lookup"),
+};
+VNET_FEATURE_INIT (ip4_snat_hairpin_dst, static) = {
+  .arc_name = "ip4-unicast",
+  .node_name = "nat44-hairpin-dst",
+  .runs_before = VNET_FEATURES ("ip4-lookup"),
+};
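+
+/*
+ * Each VNET_FEATURE_INIT above places a NAT graph node on the
+ * "ip4-unicast" feature arc and constrains its position with runs_before;
+ * the blocks below do the same on the "ip4-output" arc for the
+ * output-feature and hairpinning paths.
+ */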
+
+/* Hook up output features */
+VNET_FEATURE_INIT (ip4_snat_in2out_output, static) = {
+  .arc_name = "ip4-output",
+  .node_name = "nat44-in2out-output",
+  .runs_before = VNET_FEATURES ("interface-output"),
+};
+VNET_FEATURE_INIT (ip4_snat_in2out_output_worker_handoff, static) = {
+  .arc_name = "ip4-output",
+  .node_name = "nat44-in2out-output-worker-handoff",
+  .runs_before = VNET_FEATURES ("interface-output"),
+};
+VNET_FEATURE_INIT (ip4_snat_hairpin_src, static) = {
+  .arc_name = "ip4-output",
+  .node_name = "nat44-hairpin-src",
+  .runs_before = VNET_FEATURES ("interface-output"),
+};
+
+
+/* *INDENT-OFF* */
+VLIB_PLUGIN_REGISTER () = {
+    .version = VPP_BUILD_VER,
+    .description = "Network Address Translation",
+};
+/* *INDENT-ON* */
+
+/**
+ * @brief Add/del NAT address to FIB.
+ *
+ * Add the external NAT address to the FIB as receive entries. This ensures
+ * that VPP will reply to ARP for this address and we don't need to enable
+ * proxy ARP on the outside interface.
+ *
+ * @param addr IPv4 address.
+ * @param p_len address prefix length
+ * @param sw_if_index Interface.
+ * @param is_add If 0 delete, otherwise add.
+ */
+void
+snat_add_del_addr_to_fib (ip4_address_t * addr, u8 p_len, u32 sw_if_index,
+                          int is_add)
+{
+  fib_prefix_t prefix = {
+    .fp_len = p_len,
+    .fp_proto = FIB_PROTOCOL_IP4,
+    .fp_addr = {
+        .ip4.as_u32 = addr->as_u32,
+    },
+  };
+  u32 fib_index = ip4_fib_table_get_index_for_sw_if_index(sw_if_index);
+
+  if (is_add)
+    fib_table_entry_update_one_path(fib_index,
+                                    &prefix,
+                                    FIB_SOURCE_PLUGIN_HI,
+                                    (FIB_ENTRY_FLAG_CONNECTED |
+                                     FIB_ENTRY_FLAG_LOCAL |
+                                     FIB_ENTRY_FLAG_EXCLUSIVE),
+                                    DPO_PROTO_IP4,
+                                    NULL,
+                                    sw_if_index,
+                                    ~0,
+                                    1,
+                                    NULL,
+                                    FIB_ROUTE_PATH_FLAG_NONE);
+  else
+    fib_table_entry_delete(fib_index,
+                           &prefix,
+                           FIB_SOURCE_PLUGIN_HI);
+}
+
+void snat_add_address (snat_main_t *sm, ip4_address_t *addr, u32 vrf_id)
+{
+  snat_address_t * ap;
+  snat_interface_t *i;
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+
+  if (vrf_id != ~0)
+    sm->vrf_mode = 1;
+
+  /* Check if address already exists */
+  vec_foreach (ap, sm->addresses)
+    {
+      if (ap->addr.as_u32 == addr->as_u32)
+        return;
+    }
+
+  vec_add2 (sm->addresses, ap, 1);
+  ap->addr = *addr;
+  if (vrf_id != ~0)
+    ap->fib_index =
+      fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, vrf_id,
+                                         FIB_SOURCE_PLUGIN_HI);
+  else
+    ap->fib_index = ~0;
+#define _(N, i, n, s) \
+  clib_bitmap_alloc (ap->busy_##n##_port_bitmap, 65535); \
+  ap->busy_##n##_ports = 0; \
+  vec_validate_init_empty (ap->busy_##n##_ports_per_thread, tm->n_vlib_mains - 1, 0);
+  foreach_snat_protocol
+#undef _
+
+  /* Add external address to FIB */
+  pool_foreach (i, sm->interfaces,
+  ({
+    if (i->is_inside)
+      continue;
+
+    snat_add_del_addr_to_fib(addr, 32, i->sw_if_index, 1);
+    break;
+  }));
+  pool_foreach (i, sm->output_feature_interfaces,
+  ({
+    if (i->is_inside)
+      continue;
+
+    snat_add_del_addr_to_fib(addr, 32, i->sw_if_index, 1);
+    break;
+  }));
+}
+
+static int is_snat_address_used_in_static_mapping (snat_main_t *sm,
+                                                   ip4_address_t addr)
+{
+  snat_static_mapping_t *m;
+  pool_foreach (m, sm->static_mappings,
+  ({
+      if (m->external_addr.as_u32 == addr.as_u32)
+        return 1;
+  }));
+
+  return 0;
+}
+
+void increment_v4_address (ip4_address_t * a)
+{
+  u32 v;
+
+  v = clib_net_to_host_u32(a->as_u32) + 1;
+  a->as_u32 = clib_host_to_net_u32(v);
+}
+
+static void
+snat_add_static_mapping_when_resolved (snat_main_t * sm,
+                                       ip4_address_t l_addr,
+                                       u16 l_port,
+                                       u32 sw_if_index,
+                                       u16 e_port,
+                                       u32 vrf_id,
+                                       snat_protocol_t proto,
+                                       int addr_only,
+                                       int is_add)
+{
+  snat_static_map_resolve_t *rp;
+
+  vec_add2 (sm->to_resolve, rp, 1);
+  rp->l_addr.as_u32 = l_addr.as_u32;
+  rp->l_port = l_port;
+  rp->sw_if_index = sw_if_index;
+  rp->e_port = e_port;
+  rp->vrf_id = vrf_id;
+  rp->proto = proto;
+  rp->addr_only = addr_only;
+  rp->is_add = is_add;
+}
+
+/**
+ * @brief Add static mapping.
+ *
+ * Create static mapping between local addr+port and external addr+port.
+ *
+ * @param l_addr Local IPv4 address.
+ * @param e_addr External IPv4 address.
+ * @param l_port Local port number.
+ * @param e_port External port number.
+ * @param vrf_id VRF ID.
+ * @param addr_only If 0 address and port pair mapping, otherwise address only.
+ * @param sw_if_index External interface to take the external IP address from
+ *                    (~0 means use e_addr).
+ * @param proto Protocol.
+ * @param is_add If 0 delete static mapping, otherwise add.
+ *
+ * @returns 0 on success, non-zero value otherwise.
+ */
+int snat_add_static_mapping(ip4_address_t l_addr, ip4_address_t e_addr,
+                            u16 l_port, u16 e_port, u32 vrf_id, int addr_only,
+                            u32 sw_if_index, snat_protocol_t proto, int is_add)
+{
+  snat_main_t * sm = &snat_main;
+  snat_static_mapping_t *m;
+  snat_session_key_t m_key;
+  clib_bihash_kv_8_8_t kv, value;
+  snat_address_t *a = 0;
+  u32 fib_index = ~0;
+  uword * p;
+  snat_interface_t *interface;
+  int i;
+
+  /* If the external address is a specific interface address */
+  if (sw_if_index != ~0)
+    {
+      ip4_address_t * first_int_addr;
+
+      /* Might be already set... */
+      first_int_addr = ip4_interface_first_address
+        (sm->ip4_main, sw_if_index, 0 /* just want the address */);
+
+      /* DHCP resolution required? */
+      if (first_int_addr == 0)
+        {
+          snat_add_static_mapping_when_resolved
+            (sm, l_addr, l_port, sw_if_index, e_port, vrf_id, proto,
+             addr_only, is_add);
+          return 0;
+        }
+      else
+        e_addr.as_u32 = first_int_addr->as_u32;
+    }
+
+  m_key.addr = e_addr;
+  m_key.port = addr_only ? 0 : e_port;
+  m_key.protocol = addr_only ? 0 : proto;
+  m_key.fib_index = sm->outside_fib_index;
+  kv.key = m_key.as_u64;
+  if (clib_bihash_search_8_8 (&sm->static_mapping_by_external, &kv, &value))
+    m = 0;
+  else
+    m = pool_elt_at_index (sm->static_mappings, value.value);
+
+  if (is_add)
+    {
+      if (m)
+        return VNET_API_ERROR_VALUE_EXIST;
+
+      /* Convert VRF id to FIB index */
+      if (vrf_id != ~0)
+        {
+          p = hash_get (sm->ip4_main->fib_index_by_table_id, vrf_id);
+          if (!p)
+            return VNET_API_ERROR_NO_SUCH_FIB;
+          fib_index = p[0];
+        }
+      /* If not specified use inside VRF id from SNAT plugin startup config */
+      else
+        {
+          fib_index = sm->inside_fib_index;
+          vrf_id = sm->inside_vrf_id;
+        }
+
+      /* Find external address in allocated addresses and reserve port for
+         address and port pair mapping when dynamic translations enabled */
+      if (!addr_only && !(sm->static_mapping_only))
+        {
+          for (i = 0; i < vec_len (sm->addresses); i++)
+            {
+              if (sm->addresses[i].addr.as_u32 == e_addr.as_u32)
+                {
+                  a = sm->addresses + i;
+                  /* External port must be unused */
+                  switch (proto)
+                    {
+#define _(N, j, n, s) \
+                    case SNAT_PROTOCOL_##N: \
+                      if (clib_bitmap_get_no_check (a->busy_##n##_port_bitmap, e_port)) \
+                        return VNET_API_ERROR_INVALID_VALUE; \
+                      clib_bitmap_set_no_check (a->busy_##n##_port_bitmap, e_port, 1); \
+                      if (e_port > 1024) \
+                        { \
+                          a->busy_##n##_ports++; \
+                          a->busy_##n##_ports_per_thread[(e_port - 1024) / sm->port_per_thread]++; \
+                        } \
+                      break;
+                      foreach_snat_protocol
+#undef _
+                    default:
+                      clib_warning("unknown_protocol");
+                      return VNET_API_ERROR_INVALID_VALUE_2;
+                    }
+                  break;
+                }
+            }
+          /* External address must be allocated */
+          if (!a)
+            return VNET_API_ERROR_NO_SUCH_ENTRY;
+        }
+
+      pool_get (sm->static_mappings, m);
+      memset (m, 0, sizeof (*m));
+      m->local_addr = l_addr;
+      m->external_addr = e_addr;
+      m->addr_only = addr_only;
+      m->vrf_id = vrf_id;
+      m->fib_index = fib_index;
+      if (!addr_only)
+        {
+          m->local_port = l_port;
+          m->external_port = e_port;
+          m->proto = proto;
+        }
+
+      m_key.addr = m->local_addr;
+      m_key.port = m->local_port;
+      m_key.protocol = m->proto;
+      m_key.fib_index = m->fib_index;
+      kv.key = m_key.as_u64;
+      kv.value = m - sm->static_mappings;
+      clib_bihash_add_del_8_8(&sm->static_mapping_by_local, &kv, 1);
+
+      m_key.addr = m->external_addr;
+      m_key.port = m->external_port;
+      m_key.fib_index = sm->outside_fib_index;
+      kv.key = m_key.as_u64;
+      kv.value = m - sm->static_mappings;
+      clib_bihash_add_del_8_8(&sm->static_mapping_by_external, &kv, 1);
+
+      if (sm->workers)
+        {
+          ip4_header_t ip = {
+            .src_address = m->local_addr,
+          };
+          m->worker_index = sm->worker_in2out_cb (&ip, m->fib_index);
+        }
+    }
+  else
+    {
+      if (!m)
+        return VNET_API_ERROR_NO_SUCH_ENTRY;
+
+      /* Free external address port */
+      if (!addr_only && !(sm->static_mapping_only))
+        {
+          for (i = 0; i < vec_len (sm->addresses); i++)
+            {
+              if (sm->addresses[i].addr.as_u32 == e_addr.as_u32)
+                {
+                  a = sm->addresses + i;
+                  switch (proto)
+                    {
+#define _(N, j, n, s) \
+                    case SNAT_PROTOCOL_##N: \
+                      clib_bitmap_set_no_check (a->busy_##n##_port_bitmap, e_port, 0); \
+                      if (e_port > 1024) \
+                        { \
+                          a->busy_##n##_ports--; \
+                          a->busy_##n##_ports_per_thread[(e_port - 1024) / sm->port_per_thread]--; \
+                        } \
+                      break;
+                      foreach_snat_protocol
+#undef _
+                    default:
+                      clib_warning("unknown_protocol");
+                      return VNET_API_ERROR_INVALID_VALUE_2;
+                    }
+                  break;
+                }
+            }
+        }
+
+      m_key.addr = m->local_addr;
+      m_key.port = m->local_port;
+      m_key.protocol = m->proto;
+      m_key.fib_index = m->fib_index;
+      kv.key = m_key.as_u64;
+      clib_bihash_add_del_8_8(&sm->static_mapping_by_local, &kv, 0);
+
+      m_key.addr = m->external_addr;
+      m_key.port = m->external_port;
+      m_key.fib_index = sm->outside_fib_index;
+      kv.key = m_key.as_u64;
+      clib_bihash_add_del_8_8(&sm->static_mapping_by_external, &kv, 0);
+
+      /* Delete session(s) for static mapping if they exist */
+      if (!(sm->static_mapping_only) ||
+          (sm->static_mapping_only && sm->static_mapping_connection_tracking))
+        {
+          snat_user_key_t u_key;
+          snat_user_t *u;
+          dlist_elt_t * head, * elt;
+          u32 elt_index, head_index, del_elt_index;
+          u32 ses_index;
+          u64 user_index;
+          snat_session_t * s;
+          snat_main_per_thread_data_t *tsm;
+
+          u_key.addr = m->local_addr;
+          u_key.fib_index = m->fib_index;
+          kv.key = u_key.as_u64;
+          if (sm->num_workers > 1)
+            tsm = vec_elt_at_index (sm->per_thread_data, m->worker_index);
+          else
+            tsm = vec_elt_at_index (sm->per_thread_data, sm->num_workers);
+          if (!clib_bihash_search_8_8 (&tsm->user_hash, &kv, &value))
+            {
+              user_index = value.value;
+              u = pool_elt_at_index (tsm->users, user_index);
+              if (u->nstaticsessions)
+                {
+                  head_index = u->sessions_per_user_list_head_index;
+                  head = pool_elt_at_index (tsm->list_pool, head_index);
+                  elt_index = head->next;
+                  elt = pool_elt_at_index (tsm->list_pool, elt_index);
+                  ses_index = elt->value;
+                  while (ses_index != ~0)
+                    {
+                      s = pool_elt_at_index (tsm->sessions, ses_index);
+                      del_elt_index = elt_index;
+                      elt_index = elt->next;
+                      elt = pool_elt_at_index (tsm->list_pool, elt_index);
+                      ses_index = elt->value;
+
+                      if (!addr_only)
+                        {
+                          if ((s->out2in.addr.as_u32 != e_addr.as_u32) ||
+                              (clib_net_to_host_u16 (s->out2in.port) != e_port))
+                            continue;
+                        }
+
+                      if (snat_is_unk_proto_session (s))
+                        {
+                          clib_bihash_kv_16_8_t up_kv;
+                          nat_ed_ses_key_t up_key;
+                          up_key.l_addr = s->in2out.addr;
+                          up_key.r_addr = s->ext_host_addr;
+                          up_key.fib_index = s->in2out.fib_index;
+                          up_key.proto = s->in2out.port;
+                          up_key.rsvd = 0;
+                          up_key.l_port = 0;
+                          up_kv.key[0] = up_key.as_u64[0];
up_kv.key[1] = up_key.as_u64[1]; + if (clib_bihash_add_del_16_8 (&sm->in2out_ed, + &up_kv, 0)) + clib_warning ("in2out key del failed"); + + up_key.l_addr = s->out2in.addr; + up_key.fib_index = s->out2in.fib_index; + up_kv.key[0] = up_key.as_u64[0]; + up_kv.key[1] = up_key.as_u64[1]; + if (clib_bihash_add_del_16_8 (&sm->out2in_ed, + &up_kv, 0)) + clib_warning ("out2in key del failed"); + + goto delete; + } + /* log NAT event */ + snat_ipfix_logging_nat44_ses_delete(s->in2out.addr.as_u32, + s->out2in.addr.as_u32, + s->in2out.protocol, + s->in2out.port, + s->out2in.port, + s->in2out.fib_index); + + value.key = s->in2out.as_u64; + if (clib_bihash_add_del_8_8 (&tsm->in2out, &value, 0)) + clib_warning ("in2out key del failed"); + value.key = s->out2in.as_u64; + if (clib_bihash_add_del_8_8 (&tsm->out2in, &value, 0)) + clib_warning ("out2in key del failed"); +delete: + pool_put (tsm->sessions, s); + + clib_dlist_remove (tsm->list_pool, del_elt_index); + pool_put_index (tsm->list_pool, del_elt_index); + u->nstaticsessions--; + + if (!addr_only) + break; + } + if (addr_only) + { + pool_put (tsm->users, u); + clib_bihash_add_del_8_8 (&tsm->user_hash, &kv, 0); + } + } + } + } + + /* Delete static mapping from pool */ + pool_put (sm->static_mappings, m); + } + + if (!addr_only) + return 0; + + /* Add/delete external address to FIB */ + pool_foreach (interface, sm->interfaces, + ({ + if (interface->is_inside) + continue; + + snat_add_del_addr_to_fib(&e_addr, 32, interface->sw_if_index, is_add); + break; + })); + pool_foreach (interface, sm->output_feature_interfaces, + ({ + if (interface->is_inside) + continue; + + snat_add_del_addr_to_fib(&e_addr, 32, interface->sw_if_index, is_add); + break; + })); + + return 0; +} + +int nat44_add_del_lb_static_mapping (ip4_address_t e_addr, u16 e_port, + snat_protocol_t proto, u32 vrf_id, + nat44_lb_addr_port_t *locals, u8 is_add) +{ + snat_main_t * sm = &snat_main; + snat_static_mapping_t *m; + snat_session_key_t m_key; + clib_bihash_kv_8_8_t kv, value; + u32 fib_index; + snat_address_t *a = 0; + int i; + nat44_lb_addr_port_t *local; + u32 worker_index = 0; + snat_main_per_thread_data_t *tsm; + + m_key.addr = e_addr; + m_key.port = e_port; + m_key.protocol = proto; + m_key.fib_index = sm->outside_fib_index; + kv.key = m_key.as_u64; + if (clib_bihash_search_8_8 (&sm->static_mapping_by_external, &kv, &value)) + m = 0; + else + m = pool_elt_at_index (sm->static_mappings, value.value); + + if (is_add) + { + if (m) + return VNET_API_ERROR_VALUE_EXIST; + + if (vec_len (locals) < 2) + return VNET_API_ERROR_INVALID_VALUE; + + fib_index = fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, + vrf_id, + FIB_SOURCE_PLUGIN_HI); + + /* Find external address in allocated addresses and reserve port for + address and port pair mapping when dynamic translations enabled */ + if (!sm->static_mapping_only) + { + for (i = 0; i < vec_len (sm->addresses); i++) + { + if (sm->addresses[i].addr.as_u32 == e_addr.as_u32) + { + a = sm->addresses + i; + /* External port must be unused */ + switch (proto) + { +#define _(N, j, n, s) \ + case SNAT_PROTOCOL_##N: \ + if (clib_bitmap_get_no_check (a->busy_##n##_port_bitmap, e_port)) \ + return VNET_API_ERROR_INVALID_VALUE; \ + clib_bitmap_set_no_check (a->busy_##n##_port_bitmap, e_port, 1); \ + if (e_port > 1024) \ + { \ + a->busy_##n##_ports++; \ + a->busy_##n##_ports_per_thread[(e_port - 1024) / sm->port_per_thread]++; \ + } \ + break; + foreach_snat_protocol +#undef _ + default: + clib_warning("unknown_protocol"); + return 
+VNET_API_ERROR_INVALID_VALUE_2;
+                    }
+                  break;
+                }
+            }
+          /* External address must be allocated */
+          if (!a)
+            return VNET_API_ERROR_NO_SUCH_ENTRY;
+        }
+
+      pool_get (sm->static_mappings, m);
+      memset (m, 0, sizeof (*m));
+      m->external_addr = e_addr;
+      m->addr_only = 0;
+      m->vrf_id = vrf_id;
+      m->fib_index = fib_index;
+      m->external_port = e_port;
+      m->proto = proto;
+
+      m_key.addr = m->external_addr;
+      m_key.port = m->external_port;
+      m_key.protocol = m->proto;
+      m_key.fib_index = sm->outside_fib_index;
+      kv.key = m_key.as_u64;
+      kv.value = m - sm->static_mappings;
+      if (clib_bihash_add_del_8_8(&sm->static_mapping_by_external, &kv, 1))
+        {
+          clib_warning ("static_mapping_by_external key add failed");
+          return VNET_API_ERROR_UNSPECIFIED;
+        }
+
+      /* Assign worker */
+      if (sm->workers)
+        {
+          worker_index = sm->first_worker_index +
+            sm->workers[sm->next_worker++ % vec_len (sm->workers)];
+          tsm = vec_elt_at_index (sm->per_thread_data, worker_index);
+          m->worker_index = worker_index;
+        }
+      else
+        tsm = vec_elt_at_index (sm->per_thread_data, sm->num_workers);
+
+      m_key.port = clib_host_to_net_u16 (m->external_port);
+      kv.key = m_key.as_u64;
+      kv.value = ~0ULL;
+      if (clib_bihash_add_del_8_8(&tsm->out2in, &kv, 1))
+        {
+          clib_warning ("out2in key add failed");
+          return VNET_API_ERROR_UNSPECIFIED;
+        }
+
+      m_key.fib_index = m->fib_index;
+      for (i = 0; i < vec_len (locals); i++)
+        {
+          m_key.addr = locals[i].addr;
+          m_key.port = locals[i].port;
+          kv.key = m_key.as_u64;
+          kv.value = m - sm->static_mappings;
+          clib_bihash_add_del_8_8(&sm->static_mapping_by_local, &kv, 1);
+          /* Store the running sum of probabilities for backend selection */
+          locals[i].prefix = (i == 0) ? locals[i].probability :
+            (locals[i - 1].prefix + locals[i].probability);
+          vec_add1 (m->locals, locals[i]);
+
+          m_key.port = clib_host_to_net_u16 (locals[i].port);
+          kv.key = m_key.as_u64;
+          kv.value = ~0ULL;
+          if (clib_bihash_add_del_8_8(&tsm->in2out, &kv, 1))
+            {
+              clib_warning ("in2out key add failed");
+              return VNET_API_ERROR_UNSPECIFIED;
+            }
+        }
+    }
+  else
+    {
+      if (!m)
+        return VNET_API_ERROR_NO_SUCH_ENTRY;
+
+      fib_table_unlock (m->fib_index, FIB_PROTOCOL_IP4, FIB_SOURCE_PLUGIN_HI);
+
+      /* Free external address port */
+      if (!sm->static_mapping_only)
+        {
+          for (i = 0; i < vec_len (sm->addresses); i++)
+            {
+              if (sm->addresses[i].addr.as_u32 == e_addr.as_u32)
+                {
+                  a = sm->addresses + i;
+                  switch (proto)
+                    {
+#define _(N, j, n, s) \
+                    case SNAT_PROTOCOL_##N: \
+                      clib_bitmap_set_no_check (a->busy_##n##_port_bitmap, e_port, 0); \
+                      if (e_port > 1024) \
+                        { \
+                          a->busy_##n##_ports--; \
+                          a->busy_##n##_ports_per_thread[(e_port - 1024) / sm->port_per_thread]--; \
+                        } \
+                      break;
+                      foreach_snat_protocol
+#undef _
+                    default:
+                      clib_warning("unknown_protocol");
+                      return VNET_API_ERROR_INVALID_VALUE_2;
+                    }
+                  break;
+                }
+            }
+        }
+
+      tsm = vec_elt_at_index (sm->per_thread_data, m->worker_index);
+      m_key.addr = m->external_addr;
+      m_key.port = m->external_port;
+      m_key.protocol = m->proto;
+      m_key.fib_index = sm->outside_fib_index;
+      kv.key = m_key.as_u64;
+      if (clib_bihash_add_del_8_8(&sm->static_mapping_by_external, &kv, 0))
+        {
+          clib_warning ("static_mapping_by_external key del failed");
+          return VNET_API_ERROR_UNSPECIFIED;
+        }
+
+      m_key.port = clib_host_to_net_u16 (m->external_port);
+      kv.key = m_key.as_u64;
+      if (clib_bihash_add_del_8_8(&tsm->out2in, &kv, 0))
+        {
+          clib_warning ("out2in key del failed");
+          return VNET_API_ERROR_UNSPECIFIED;
+        }
+
+      vec_foreach (local, m->locals)
+        {
+          m_key.addr = local->addr;
+          m_key.port = local->port;
+          m_key.fib_index = m->fib_index;
+          kv.key =
m_key.as_u64; + if (clib_bihash_add_del_8_8(&sm->static_mapping_by_local, &kv, 0)) + { + clib_warning ("static_mapping_by_local key del failed"); + return VNET_API_ERROR_UNSPECIFIED; + } + + m_key.port = clib_host_to_net_u16 (local->port); + kv.key = m_key.as_u64; + if (clib_bihash_add_del_8_8(&tsm->in2out, &kv, 0)) + { + clib_warning ("in2out key del failed"); + return VNET_API_ERROR_UNSPECIFIED; + } + } + vec_free(m->locals); + + pool_put (sm->static_mappings, m); + } + + return 0; +} + +int snat_del_address (snat_main_t *sm, ip4_address_t addr, u8 delete_sm) +{ + snat_address_t *a = 0; + snat_session_t *ses; + u32 *ses_to_be_removed = 0, *ses_index; + clib_bihash_kv_8_8_t kv, value; + snat_user_key_t user_key; + snat_user_t *u; + snat_main_per_thread_data_t *tsm; + snat_static_mapping_t *m; + snat_interface_t *interface; + int i; + + /* Find SNAT address */ + for (i=0; i < vec_len (sm->addresses); i++) + { + if (sm->addresses[i].addr.as_u32 == addr.as_u32) + { + a = sm->addresses + i; + break; + } + } + if (!a) + return VNET_API_ERROR_NO_SUCH_ENTRY; + + if (delete_sm) + { + pool_foreach (m, sm->static_mappings, + ({ + if (m->external_addr.as_u32 == addr.as_u32) + (void) snat_add_static_mapping (m->local_addr, m->external_addr, + m->local_port, m->external_port, + m->vrf_id, m->addr_only, ~0, + m->proto, 0); + })); + } + else + { + /* Check if address is used in some static mapping */ + if (is_snat_address_used_in_static_mapping(sm, addr)) + { + clib_warning ("address used in static mapping"); + return VNET_API_ERROR_UNSPECIFIED; + } + } + + if (a->fib_index != ~0) + fib_table_unlock(a->fib_index, FIB_PROTOCOL_IP4, + FIB_SOURCE_PLUGIN_HI); + + /* Delete sessions using address */ + if (a->busy_tcp_ports || a->busy_udp_ports || a->busy_icmp_ports) + { + vec_foreach (tsm, sm->per_thread_data) + { + pool_foreach (ses, tsm->sessions, ({ + if (ses->out2in.addr.as_u32 == addr.as_u32) + { + if (snat_is_unk_proto_session (ses)) + { + clib_bihash_kv_16_8_t up_kv; + nat_ed_ses_key_t up_key; + up_key.l_addr = ses->in2out.addr; + up_key.r_addr = ses->ext_host_addr; + up_key.fib_index = ses->in2out.fib_index; + up_key.proto = ses->in2out.port; + up_key.rsvd = 0; + up_key.l_port = 0; + up_kv.key[0] = up_key.as_u64[0]; + up_kv.key[1] = up_key.as_u64[1]; + if (clib_bihash_add_del_16_8 (&sm->in2out_ed, + &up_kv, 0)) + clib_warning ("in2out key del failed"); + + up_key.l_addr = ses->out2in.addr; + up_key.fib_index = ses->out2in.fib_index; + up_kv.key[0] = up_key.as_u64[0]; + up_kv.key[1] = up_key.as_u64[1]; + if (clib_bihash_add_del_16_8 (&sm->out2in_ed, + &up_kv, 0)) + clib_warning ("out2in key del failed"); + } + else + { + /* log NAT event */ + snat_ipfix_logging_nat44_ses_delete(ses->in2out.addr.as_u32, + ses->out2in.addr.as_u32, + ses->in2out.protocol, + ses->in2out.port, + ses->out2in.port, + ses->in2out.fib_index); + kv.key = ses->in2out.as_u64; + clib_bihash_add_del_8_8 (&tsm->in2out, &kv, 0); + kv.key = ses->out2in.as_u64; + clib_bihash_add_del_8_8 (&tsm->out2in, &kv, 0); + } + vec_add1 (ses_to_be_removed, ses - tsm->sessions); + clib_dlist_remove (tsm->list_pool, ses->per_user_index); + user_key.addr = ses->in2out.addr; + user_key.fib_index = ses->in2out.fib_index; + kv.key = user_key.as_u64; + if (!clib_bihash_search_8_8 (&tsm->user_hash, &kv, &value)) + { + u = pool_elt_at_index (tsm->users, value.value); + u->nsessions--; + } + } + })); + + vec_foreach (ses_index, ses_to_be_removed) + pool_put_index (tsm->sessions, ses_index[0]); + + vec_free (ses_to_be_removed); + } + } + + vec_del1 
(sm->addresses, i); + + /* Delete external address from FIB */ + pool_foreach (interface, sm->interfaces, + ({ + if (interface->is_inside) + continue; + + snat_add_del_addr_to_fib(&addr, 32, interface->sw_if_index, 0); + break; + })); + pool_foreach (interface, sm->output_feature_interfaces, + ({ + if (interface->is_inside) + continue; + + snat_add_del_addr_to_fib(&addr, 32, interface->sw_if_index, 0); + break; + })); + + return 0; +} + +int snat_interface_add_del (u32 sw_if_index, u8 is_inside, int is_del) +{ + snat_main_t *sm = &snat_main; + snat_interface_t *i; + const char * feature_name; + snat_address_t * ap; + snat_static_mapping_t * m; + snat_det_map_t * dm; + + if (sm->static_mapping_only && !(sm->static_mapping_connection_tracking)) + feature_name = is_inside ? "nat44-in2out-fast" : "nat44-out2in-fast"; + else + { + if (sm->num_workers > 1 && !sm->deterministic) + feature_name = is_inside ? "nat44-in2out-worker-handoff" : "nat44-out2in-worker-handoff"; + else if (sm->deterministic) + feature_name = is_inside ? "nat44-det-in2out" : "nat44-det-out2in"; + else + feature_name = is_inside ? "nat44-in2out" : "nat44-out2in"; + } + + vnet_feature_enable_disable ("ip4-unicast", feature_name, sw_if_index, + !is_del, 0, 0); + + if (sm->fq_in2out_index == ~0 && !sm->deterministic && sm->num_workers > 1) + sm->fq_in2out_index = vlib_frame_queue_main_init (sm->in2out_node_index, 0); + + if (sm->fq_out2in_index == ~0 && !sm->deterministic && sm->num_workers > 1) + sm->fq_out2in_index = vlib_frame_queue_main_init (sm->out2in_node_index, 0); + + pool_foreach (i, sm->interfaces, + ({ + if (i->sw_if_index == sw_if_index) + { + if (is_del) + pool_put (sm->interfaces, i); + else + return VNET_API_ERROR_VALUE_EXIST; + + goto fib; + } + })); + + if (is_del) + return VNET_API_ERROR_NO_SUCH_ENTRY; + + pool_get (sm->interfaces, i); + i->sw_if_index = sw_if_index; + i->is_inside = is_inside; + + /* Add/delete external addresses to FIB */ +fib: + if (is_inside) + return 0; + + vec_foreach (ap, sm->addresses) + snat_add_del_addr_to_fib(&ap->addr, 32, sw_if_index, !is_del); + + pool_foreach (m, sm->static_mappings, + ({ + if (!(m->addr_only)) + continue; + + snat_add_del_addr_to_fib(&m->external_addr, 32, sw_if_index, !is_del); + })); + + pool_foreach (dm, sm->det_maps, + ({ + snat_add_del_addr_to_fib(&dm->out_addr, dm->out_plen, sw_if_index, !is_del); + })); + + return 0; +} + +int snat_interface_add_del_output_feature (u32 sw_if_index, + u8 is_inside, + int is_del) +{ + snat_main_t *sm = &snat_main; + snat_interface_t *i; + snat_address_t * ap; + snat_static_mapping_t * m; + + if (sm->deterministic || + (sm->static_mapping_only && !(sm->static_mapping_connection_tracking))) + return VNET_API_ERROR_UNSUPPORTED; + + if (is_inside) + { + vnet_feature_enable_disable ("ip4-unicast", "nat44-hairpin-dst", + sw_if_index, !is_del, 0, 0); + vnet_feature_enable_disable ("ip4-output", "nat44-hairpin-src", + sw_if_index, !is_del, 0, 0); + goto fq; + } + + if (sm->num_workers > 1) + { + vnet_feature_enable_disable ("ip4-unicast", "nat44-out2in-worker-handoff", + sw_if_index, !is_del, 0, 0); + vnet_feature_enable_disable ("ip4-output", + "nat44-in2out-output-worker-handoff", + sw_if_index, !is_del, 0, 0); + } + else + { + vnet_feature_enable_disable ("ip4-unicast", "nat44-out2in", sw_if_index, + !is_del, 0, 0); + vnet_feature_enable_disable ("ip4-output", "nat44-in2out-output", + sw_if_index, !is_del, 0, 0); + } + +fq: + if (sm->fq_in2out_output_index == ~0 && sm->num_workers > 1) + sm->fq_in2out_output_index = + 
vlib_frame_queue_main_init (sm->in2out_output_node_index, 0); + + if (sm->fq_out2in_index == ~0 && sm->num_workers > 1) + sm->fq_out2in_index = vlib_frame_queue_main_init (sm->out2in_node_index, 0); + + pool_foreach (i, sm->output_feature_interfaces, + ({ + if (i->sw_if_index == sw_if_index) + { + if (is_del) + pool_put (sm->output_feature_interfaces, i); + else + return VNET_API_ERROR_VALUE_EXIST; + + goto fib; + } + })); + + if (is_del) + return VNET_API_ERROR_NO_SUCH_ENTRY; + + pool_get (sm->output_feature_interfaces, i); + i->sw_if_index = sw_if_index; + i->is_inside = is_inside; + + /* Add/delete external addresses to FIB */ +fib: + if (is_inside) + return 0; + + vec_foreach (ap, sm->addresses) + snat_add_del_addr_to_fib(&ap->addr, 32, sw_if_index, !is_del); + + pool_foreach (m, sm->static_mappings, + ({ + if (!(m->addr_only)) + continue; + + snat_add_del_addr_to_fib(&m->external_addr, 32, sw_if_index, !is_del); + })); + + return 0; +} + +int snat_set_workers (uword * bitmap) +{ + snat_main_t *sm = &snat_main; + int i, j = 0; + + if (sm->num_workers < 2) + return VNET_API_ERROR_FEATURE_DISABLED; + + if (clib_bitmap_last_set (bitmap) >= sm->num_workers) + return VNET_API_ERROR_INVALID_WORKER; + + vec_free (sm->workers); + clib_bitmap_foreach (i, bitmap, + ({ + vec_add1(sm->workers, i); + sm->per_thread_data[i].snat_thread_index = j; + j++; + })); + + sm->port_per_thread = (0xffff - 1024) / _vec_len (sm->workers); + sm->num_snat_thread = _vec_len (sm->workers); + + return 0; +} + + +static void +snat_ip4_add_del_interface_address_cb (ip4_main_t * im, + uword opaque, + u32 sw_if_index, + ip4_address_t * address, + u32 address_length, + u32 if_address_index, + u32 is_delete); + +static clib_error_t * snat_init (vlib_main_t * vm) +{ + snat_main_t * sm = &snat_main; + clib_error_t * error = 0; + ip4_main_t * im = &ip4_main; + ip_lookup_main_t * lm = &im->lookup_main; + uword *p; + vlib_thread_registration_t *tr; + vlib_thread_main_t *tm = vlib_get_thread_main (); + uword *bitmap = 0; + u32 i; + ip4_add_del_interface_address_callback_t cb4; + + sm->vlib_main = vm; + sm->vnet_main = vnet_get_main(); + sm->ip4_main = im; + sm->ip4_lookup_main = lm; + sm->api_main = &api_main; + sm->first_worker_index = 0; + sm->next_worker = 0; + sm->num_workers = 0; + sm->num_snat_thread = 1; + sm->workers = 0; + sm->port_per_thread = 0xffff - 1024; + sm->fq_in2out_index = ~0; + sm->fq_out2in_index = ~0; + sm->udp_timeout = SNAT_UDP_TIMEOUT; + sm->tcp_established_timeout = SNAT_TCP_ESTABLISHED_TIMEOUT; + sm->tcp_transitory_timeout = SNAT_TCP_TRANSITORY_TIMEOUT; + sm->icmp_timeout = SNAT_ICMP_TIMEOUT; + + p = hash_get_mem (tm->thread_registrations_by_name, "workers"); + if (p) + { + tr = (vlib_thread_registration_t *) p[0]; + if (tr) + { + sm->num_workers = tr->count; + sm->first_worker_index = tr->first_index; + } + } + + vec_validate (sm->per_thread_data, tm->n_vlib_mains - 1); + + /* Use all available workers by default */ + if (sm->num_workers > 1) + { + for (i=0; i < sm->num_workers; i++) + bitmap = clib_bitmap_set (bitmap, i, 1); + snat_set_workers(bitmap); + clib_bitmap_free (bitmap); + } + else + { + sm->per_thread_data[0].snat_thread_index = 0; + } + + error = snat_api_init(vm, sm); + if (error) + return error; + + /* Set up the interface address add/del callback */ + cb4.function = snat_ip4_add_del_interface_address_cb; + cb4.function_opaque = 0; + + vec_add1 (im->add_del_interface_address_callbacks, cb4); + + /* Init IPFIX logging */ + snat_ipfix_logging_init(vm); + + error = nat64_init(vm); + + 
return error; +} + +VLIB_INIT_FUNCTION (snat_init); + +void snat_free_outside_address_and_port (snat_main_t * sm, + u32 thread_index, + snat_session_key_t * k, + u32 address_index) +{ + snat_address_t *a; + u16 port_host_byte_order = clib_net_to_host_u16 (k->port); + + ASSERT (address_index < vec_len (sm->addresses)); + + a = sm->addresses + address_index; + + switch (k->protocol) + { +#define _(N, i, n, s) \ + case SNAT_PROTOCOL_##N: \ + ASSERT (clib_bitmap_get_no_check (a->busy_##n##_port_bitmap, \ + port_host_byte_order) == 1); \ + clib_bitmap_set_no_check (a->busy_##n##_port_bitmap, \ + port_host_byte_order, 0); \ + a->busy_##n##_ports--; \ + a->busy_##n##_ports_per_thread[thread_index]--; \ + break; + foreach_snat_protocol +#undef _ + default: + clib_warning("unknown_protocol"); + return; + } +} + +/** + * @brief Match NAT44 static mapping. + * + * @param sm NAT main. + * @param match Address and port to match. + * @param mapping External or local address and port of the matched mapping. + * @param by_external If 0 match by local address otherwise match by external + * address. + * @param is_addr_only If matched mapping is address only + * + * @returns 0 if match found otherwise 1. + */ +int snat_static_mapping_match (snat_main_t * sm, + snat_session_key_t match, + snat_session_key_t * mapping, + u8 by_external, + u8 *is_addr_only) +{ + clib_bihash_kv_8_8_t kv, value; + snat_static_mapping_t *m; + snat_session_key_t m_key; + clib_bihash_8_8_t *mapping_hash = &sm->static_mapping_by_local; + u32 rand, lo = 0, hi, mid; + + if (by_external) + mapping_hash = &sm->static_mapping_by_external; + + m_key.addr = match.addr; + m_key.port = clib_net_to_host_u16 (match.port); + m_key.protocol = match.protocol; + m_key.fib_index = match.fib_index; + + kv.key = m_key.as_u64; + + if (clib_bihash_search_8_8 (mapping_hash, &kv, &value)) + { + /* Try address only mapping */ + m_key.port = 0; + m_key.protocol = 0; + kv.key = m_key.as_u64; + if (clib_bihash_search_8_8 (mapping_hash, &kv, &value)) + return 1; + } + + m = pool_elt_at_index (sm->static_mappings, value.value); + + if (by_external) + { + if (vec_len (m->locals)) + { + hi = vec_len (m->locals) - 1; + rand = 1 + (random_u32 (&sm->random_seed) % m->locals[hi].prefix); + while (lo < hi) + { + mid = ((hi - lo) >> 1) + lo; + (rand > m->locals[mid].prefix) ? (lo = mid + 1) : (hi = mid); + } + if (!(m->locals[lo].prefix >= rand)) + return 1; + mapping->addr = m->locals[lo].addr; + mapping->port = clib_host_to_net_u16 (m->locals[lo].port); + } + else + { + mapping->addr = m->local_addr; + /* Address only mapping doesn't change port */ + mapping->port = m->addr_only ? match.port + : clib_host_to_net_u16 (m->local_port); + } + mapping->fib_index = m->fib_index; + mapping->protocol = m->proto; + } + else + { + mapping->addr = m->external_addr; + /* Address only mapping doesn't change port */ + mapping->port = m->addr_only ? 
match.port + : clib_host_to_net_u16 (m->external_port); + mapping->fib_index = sm->outside_fib_index; + } + + if (PREDICT_FALSE(is_addr_only != 0)) + *is_addr_only = m->addr_only; + + return 0; +} + +static_always_inline u16 +snat_random_port (snat_main_t * sm, u16 min, u16 max) +{ + return min + random_u32 (&sm->random_seed) / + (random_u32_max() / (max - min + 1) + 1); +} + +int snat_alloc_outside_address_and_port (snat_main_t * sm, + u32 fib_index, + u32 thread_index, + snat_session_key_t * k, + u32 * address_indexp) +{ + int i; + snat_address_t *a; + u32 portnum; + + for (i = 0; i < vec_len (sm->addresses); i++) + { + a = sm->addresses + i; + if (sm->vrf_mode && a->fib_index != ~0 && a->fib_index != fib_index) + continue; + switch (k->protocol) + { +#define _(N, j, n, s) \ + case SNAT_PROTOCOL_##N: \ + if (a->busy_##n##_ports_per_thread[thread_index] < sm->port_per_thread) \ + { \ + while (1) \ + { \ + portnum = (sm->port_per_thread * \ + sm->per_thread_data[thread_index].snat_thread_index) + \ + snat_random_port(sm, 1, sm->port_per_thread) + 1024; \ + if (clib_bitmap_get_no_check (a->busy_##n##_port_bitmap, portnum)) \ + continue; \ + clib_bitmap_set_no_check (a->busy_##n##_port_bitmap, portnum, 1); \ + a->busy_##n##_ports_per_thread[thread_index]++; \ + a->busy_##n##_ports++; \ + k->addr = a->addr; \ + k->port = clib_host_to_net_u16(portnum); \ + *address_indexp = i; \ + return 0; \ + } \ + } \ + break; + foreach_snat_protocol +#undef _ + default: + clib_warning("unknown protocol"); + return 1; + } + + } + /* Totally out of translations to use... */ + snat_ipfix_logging_addresses_exhausted(0); + return 1; +} + + +static clib_error_t * +add_address_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + snat_main_t * sm = &snat_main; + ip4_address_t start_addr, end_addr, this_addr; + u32 start_host_order, end_host_order; + u32 vrf_id = ~0; + int i, count; + int is_add = 1; + int rv = 0; + clib_error_t *error = 0; + + /* Get a line of input. 
*/ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "%U - %U", + unformat_ip4_address, &start_addr, + unformat_ip4_address, &end_addr)) + ; + else if (unformat (line_input, "tenant-vrf %u", &vrf_id)) + ; + else if (unformat (line_input, "%U", unformat_ip4_address, &start_addr)) + end_addr = start_addr; + else if (unformat (line_input, "del")) + is_add = 0; + else + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (sm->static_mapping_only) + { + error = clib_error_return (0, "static mapping only mode"); + goto done; + } + + start_host_order = clib_net_to_host_u32 (start_addr.as_u32); + end_host_order = clib_net_to_host_u32 (end_addr.as_u32); + + if (end_host_order < start_host_order) + { + error = clib_error_return (0, "end address less than start address"); + goto done; + } + + count = (end_host_order - start_host_order) + 1; + + if (count > 1024) + clib_warning ("%U - %U, %d addresses...", + format_ip4_address, &start_addr, + format_ip4_address, &end_addr, + count); + + this_addr = start_addr; + + for (i = 0; i < count; i++) + { + if (is_add) + snat_add_address (sm, &this_addr, vrf_id); + else + rv = snat_del_address (sm, this_addr, 0); + + switch (rv) + { + case VNET_API_ERROR_NO_SUCH_ENTRY: + error = clib_error_return (0, "S-NAT address does not exist."); + goto done; + case VNET_API_ERROR_UNSPECIFIED: + error = clib_error_return (0, "S-NAT address is used in a static mapping."); + goto done; + default: + break; + } + + increment_v4_address (&this_addr); + } + +done: + unformat_free (line_input); + + return error; +} + +VLIB_CLI_COMMAND (add_address_command, static) = { + .path = "nat44 add address", + .short_help = "nat44 add address <ip4-range-start> [- <ip4-range-end>] " + "[tenant-vrf <vrf-id>] [del]", + .function = add_address_command_fn, +}; + +static clib_error_t * +snat_feature_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + vnet_main_t * vnm = vnet_get_main(); + clib_error_t * error = 0; + u32 sw_if_index; + u32 * inside_sw_if_indices = 0; + u32 * outside_sw_if_indices = 0; + u8 is_output_feature = 0; + int is_del = 0; + int i; + + sw_if_index = ~0; + + /* Get a line of input. */ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "in %U", unformat_vnet_sw_interface, + vnm, &sw_if_index)) + vec_add1 (inside_sw_if_indices, sw_if_index); + else if (unformat (line_input, "out %U", unformat_vnet_sw_interface, + vnm, &sw_if_index)) + vec_add1 (outside_sw_if_indices, sw_if_index); + else if (unformat (line_input, "output-feature")) + is_output_feature = 1; + else if (unformat (line_input, "del")) + is_del = 1; + else + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (vec_len (inside_sw_if_indices)) + { + for (i = 0; i < vec_len(inside_sw_if_indices); i++) + { + sw_if_index = inside_sw_if_indices[i]; + if (is_output_feature) + { + if (snat_interface_add_del_output_feature (sw_if_index, 1, is_del)) + { + error = clib_error_return (0, "%s %U failed", + is_del ?
"del" : "add", + format_vnet_sw_interface_name, vnm, + vnet_get_sw_interface (vnm, + sw_if_index)); + goto done; + } + } + else + { + if (snat_interface_add_del (sw_if_index, 1, is_del)) + { + error = clib_error_return (0, "%s %U failed", + is_del ? "del" : "add", + format_vnet_sw_interface_name, vnm, + vnet_get_sw_interface (vnm, + sw_if_index)); + goto done; + } + } + } + } + + if (vec_len (outside_sw_if_indices)) + { + for (i = 0; i < vec_len(outside_sw_if_indices); i++) + { + sw_if_index = outside_sw_if_indices[i]; + if (is_output_feature) + { + if (snat_interface_add_del_output_feature (sw_if_index, 0, is_del)) + { + error = clib_error_return (0, "%s %U failed", + is_del ? "del" : "add", + format_vnet_sw_interface_name, vnm, + vnet_get_sw_interface (vnm, + sw_if_index)); + goto done; + } + } + else + { + if (snat_interface_add_del (sw_if_index, 0, is_del)) + { + error = clib_error_return (0, "%s %U failed", + is_del ? "del" : "add", + format_vnet_sw_interface_name, vnm, + vnet_get_sw_interface (vnm, + sw_if_index)); + goto done; + } + } + } + } + +done: + unformat_free (line_input); + vec_free (inside_sw_if_indices); + vec_free (outside_sw_if_indices); + + return error; +} + +VLIB_CLI_COMMAND (set_interface_snat_command, static) = { + .path = "set interface nat44", + .function = snat_feature_command_fn, + .short_help = "set interface nat44 in <intfc> out <intfc> [output-feature] " + "[del]", +}; + +uword +unformat_snat_protocol (unformat_input_t * input, va_list * args) +{ + u32 *r = va_arg (*args, u32 *); + + if (0); +#define _(N, i, n, s) else if (unformat (input, s)) *r = SNAT_PROTOCOL_##N; + foreach_snat_protocol +#undef _ + else + return 0; + return 1; +} + +u8 * +format_snat_protocol (u8 * s, va_list * args) +{ + u32 i = va_arg (*args, u32); + u8 *t = 0; + + switch (i) + { +#define _(N, j, n, str) case SNAT_PROTOCOL_##N: t = (u8 *) str; break; + foreach_snat_protocol +#undef _ + default: + s = format (s, "unknown"); + return s; + } + s = format (s, "%s", t); + return s; +} + +static clib_error_t * +add_static_mapping_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + clib_error_t * error = 0; + ip4_address_t l_addr, e_addr; + u32 l_port = 0, e_port = 0, vrf_id = ~0; + int is_add = 1; + int addr_only = 1; + u32 sw_if_index = ~0; + vnet_main_t * vnm = vnet_get_main(); + int rv; + snat_protocol_t proto; + u8 proto_set = 0; + + /* Get a line of input. 
*/ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "local %U %u", unformat_ip4_address, &l_addr, + &l_port)) + addr_only = 0; + else if (unformat (line_input, "local %U", unformat_ip4_address, &l_addr)) + ; + else if (unformat (line_input, "external %U %u", unformat_ip4_address, + &e_addr, &e_port)) + addr_only = 0; + else if (unformat (line_input, "external %U", unformat_ip4_address, + &e_addr)) + ; + else if (unformat (line_input, "external %U %u", + unformat_vnet_sw_interface, vnm, &sw_if_index, + &e_port)) + addr_only = 0; + + else if (unformat (line_input, "external %U", + unformat_vnet_sw_interface, vnm, &sw_if_index)) + ; + else if (unformat (line_input, "vrf %u", &vrf_id)) + ; + else if (unformat (line_input, "%U", unformat_snat_protocol, &proto)) + proto_set = 1; + else if (unformat (line_input, "del")) + is_add = 0; + else + { + error = clib_error_return (0, "unknown input: '%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (!addr_only && !proto_set) + { + error = clib_error_return (0, "missing protocol"); + goto done; + } + + rv = snat_add_static_mapping(l_addr, e_addr, (u16) l_port, (u16) e_port, + vrf_id, addr_only, sw_if_index, proto, is_add); + + switch (rv) + { + case VNET_API_ERROR_INVALID_VALUE: + error = clib_error_return (0, "External port already in use."); + goto done; + case VNET_API_ERROR_NO_SUCH_ENTRY: + if (is_add) + error = clib_error_return (0, "External address must be allocated."); + else + error = clib_error_return (0, "Mapping does not exist."); + goto done; + case VNET_API_ERROR_NO_SUCH_FIB: + error = clib_error_return (0, "No such VRF id."); + goto done; + case VNET_API_ERROR_VALUE_EXIST: + error = clib_error_return (0, "Mapping already exists."); + goto done; + default: + break; + } + +done: + unformat_free (line_input); + + return error; +} + +/*? + * @cliexpar + * @cliexstart{snat add static mapping} + * Static mapping allows hosts on the external network to initiate connections + * to hosts on the local network. + * To create a static mapping between local host address 10.0.0.3 port 6303 and + * external address 4.4.4.4 port 3606 for TCP protocol use: + * vpp# nat44 add static mapping tcp local 10.0.0.3 6303 external 4.4.4.4 3606 + * If not running in "static mapping only" NAT plugin mode, first use: + * vpp# nat44 add address 4.4.4.4 + * To create a static mapping between local and external address use: + * vpp# nat44 add static mapping local 10.0.0.3 external 4.4.4.4 + * @cliexend +?*/ +VLIB_CLI_COMMAND (add_static_mapping_command, static) = { + .path = "nat44 add static mapping", + .function = add_static_mapping_command_fn, + .short_help = + "nat44 add static mapping tcp|udp|icmp local <addr> [<port>] external <addr> [<port>] [vrf <table-id>] [del]", +}; + +static clib_error_t * +add_lb_static_mapping_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + clib_error_t * error = 0; + ip4_address_t l_addr, e_addr; + u32 l_port = 0, e_port = 0, vrf_id = 0, probability = 0; + int is_add = 1; + int rv; + snat_protocol_t proto; + u8 proto_set = 0; + nat44_lb_addr_port_t *locals = 0, local; + + /* Get a line of input.
*/ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "local %U:%u probability %u", + unformat_ip4_address, &l_addr, &l_port, &probability)) + { + memset (&local, 0, sizeof (local)); + local.addr = l_addr; + local.port = (u16) l_port; + local.probability = (u8) probability; + vec_add1 (locals, local); + } + else if (unformat (line_input, "external %U:%u", unformat_ip4_address, + &e_addr, &e_port)) + ; + else if (unformat (line_input, "vrf %u", &vrf_id)) + ; + else if (unformat (line_input, "protocol %U", unformat_snat_protocol, + &proto)) + proto_set = 1; + else if (unformat (line_input, "del")) + is_add = 0; + else + { + error = clib_error_return (0, "unknown input: '%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (vec_len (locals) < 2) + { + error = clib_error_return (0, "at least two locals must be set"); + goto done; + } + + if (!proto_set) + { + error = clib_error_return (0, "missing protocol"); + goto done; + } + + rv = nat44_add_del_lb_static_mapping (e_addr, (u16) e_port, proto, vrf_id, + locals, is_add); + + switch (rv) + { + case VNET_API_ERROR_INVALID_VALUE: + error = clib_error_return (0, "External port already in use."); + goto done; + case VNET_API_ERROR_NO_SUCH_ENTRY: + if (is_add) + error = clib_error_return (0, "External address must be allocated."); + else + error = clib_error_return (0, "Mapping does not exist."); + goto done; + case VNET_API_ERROR_VALUE_EXIST: + error = clib_error_return (0, "Mapping already exists."); + goto done; + default: + break; + } + +done: + unformat_free (line_input); + vec_free (locals); + + return error; +} + +VLIB_CLI_COMMAND (add_lb_static_mapping_command, static) = { + .path = "nat44 add load-balancing static mapping", + .function = add_lb_static_mapping_command_fn, + .short_help = + "nat44 add load-balancing static mapping protocol tcp|udp external <addr>:<port> local <addr>:<port> probability <n> [vrf <table-id>] [del]", +}; + +static clib_error_t * +set_workers_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + uword *bitmap = 0; + int rv = 0; + clib_error_t *error = 0; + + /* Get a line of input. */ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "%U", unformat_bitmap_list, &bitmap)) + ; + else + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (bitmap == 0) + { + error = clib_error_return (0, "List of workers must be specified."); + goto done; + } + + rv = snat_set_workers(bitmap); + + clib_bitmap_free (bitmap); + + switch (rv) + { + case VNET_API_ERROR_INVALID_WORKER: + error = clib_error_return (0, "Invalid worker(s)."); + goto done; + case VNET_API_ERROR_FEATURE_DISABLED: + error = clib_error_return (0, + "Supported only if 2 or more workers are available."); + goto done; + default: + break; + } + +done: + unformat_free (line_input); + + return error; +} + +/*?
+ * @cliexpar + * @cliexstart{set nat workers} + * Set NAT workers when 2 or more workers are available, use: + * vpp# set nat workers 0-2,5 + * @cliexend +?*/ +VLIB_CLI_COMMAND (set_workers_command, static) = { + .path = "set nat workers", + .function = set_workers_command_fn, + .short_help = + "set nat workers <workers-list>", +}; + +static clib_error_t * +snat_ipfix_logging_enable_disable_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + u32 domain_id = 0; + u32 src_port = 0; + u8 enable = 1; + int rv = 0; + clib_error_t *error = 0; + + /* Get a line of input. */ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "domain %d", &domain_id)) + ; + else if (unformat (line_input, "src-port %d", &src_port)) + ; + else if (unformat (line_input, "disable")) + enable = 0; + else + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, line_input); + goto done; + } + } + + rv = snat_ipfix_logging_enable_disable (enable, domain_id, (u16) src_port); + + if (rv) + { + error = clib_error_return (0, "ipfix logging enable failed"); + goto done; + } + +done: + unformat_free (line_input); + + return error; +} + +/*? + * @cliexpar + * @cliexstart{snat ipfix logging} + * To enable NAT IPFIX logging use: + * vpp# nat ipfix logging + * To set the IPFIX exporter use: + * vpp# set ipfix exporter collector 10.10.10.3 src 10.10.10.1 + * @cliexend +?*/ +VLIB_CLI_COMMAND (snat_ipfix_logging_enable_disable_command, static) = { + .path = "nat ipfix logging", + .function = snat_ipfix_logging_enable_disable_command_fn, + .short_help = "nat ipfix logging [domain <domain-id>] [src-port <port>] [disable]", +}; + +static u32 +snat_get_worker_in2out_cb (ip4_header_t * ip0, u32 rx_fib_index0) +{ + snat_main_t *sm = &snat_main; + u32 next_worker_index = 0; + u32 hash; + + next_worker_index = sm->first_worker_index; + hash = ip0->src_address.as_u32 + (ip0->src_address.as_u32 >> 8) + + (ip0->src_address.as_u32 >> 16) + (ip0->src_address.as_u32 >>24); + + if (PREDICT_TRUE (is_pow2 (_vec_len (sm->workers)))) + next_worker_index += sm->workers[hash & (_vec_len (sm->workers) - 1)]; + else + next_worker_index += sm->workers[hash % _vec_len (sm->workers)]; + + return next_worker_index; +} + +static u32 +snat_get_worker_out2in_cb (ip4_header_t * ip0, u32 rx_fib_index0) +{ + snat_main_t *sm = &snat_main; + udp_header_t *udp; + u16 port; + snat_session_key_t m_key; + clib_bihash_kv_8_8_t kv, value; + snat_static_mapping_t *m; + nat_ed_ses_key_t key; + clib_bihash_kv_16_8_t s_kv, s_value; + snat_main_per_thread_data_t *tsm; + snat_session_t *s; + int i; + u32 proto; + + /* first try static mappings without port */ + if (PREDICT_FALSE (pool_elts (sm->static_mappings))) + { + m_key.addr = ip0->dst_address; + m_key.port = 0; + m_key.protocol = 0; + m_key.fib_index = rx_fib_index0; + kv.key = m_key.as_u64; + if (!clib_bihash_search_8_8 (&sm->static_mapping_by_external, &kv, &value)) + { + m = pool_elt_at_index (sm->static_mappings, value.value); + return m->worker_index; + } + } + + proto = ip_proto_to_snat_proto (ip0->protocol); + udp = ip4_next_header (ip0); + port = udp->dst_port; + + /* unknown protocol */ + if (PREDICT_FALSE (proto == ~0)) + { + key.l_addr = ip0->dst_address; + key.r_addr = ip0->src_address; + key.fib_index = rx_fib_index0; + key.proto = ip0->protocol; + key.rsvd = 0; +
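/* unknown protocols have no L4 ports, so the endpoint-dependent 16-byte key is matched on addresses, fib index and IP protocol only */ +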
key.l_port = 0; + s_kv.key[0] = key.as_u64[0]; + s_kv.key[1] = key.as_u64[1]; + + if (!clib_bihash_search_16_8 (&sm->out2in_ed, &s_kv, &s_value)) + { + for (i = 0; i < _vec_len (sm->per_thread_data); i++) + { + tsm = vec_elt_at_index (sm->per_thread_data, i); + if (!pool_is_free_index(tsm->sessions, s_value.value)) + { + s = pool_elt_at_index (tsm->sessions, s_value.value); + if (s->out2in.addr.as_u32 == ip0->dst_address.as_u32 && + s->out2in.port == ip0->protocol && + snat_is_unk_proto_session (s)) + return i; + } + } + } + + /* if no session use current thread */ + return vlib_get_thread_index (); + } + + if (PREDICT_FALSE (ip0->protocol == IP_PROTOCOL_ICMP)) + { + icmp46_header_t * icmp = (icmp46_header_t *) udp; + icmp_echo_header_t *echo = (icmp_echo_header_t *)(icmp + 1); + if (!icmp_is_error_message (icmp)) + port = echo->identifier; + else + { + ip4_header_t *inner_ip = (ip4_header_t *)(echo + 1); + proto = ip_proto_to_snat_proto (inner_ip->protocol); + void *l4_header = ip4_next_header (inner_ip); + switch (proto) + { + case SNAT_PROTOCOL_ICMP: + icmp = (icmp46_header_t*)l4_header; + echo = (icmp_echo_header_t *)(icmp + 1); + port = echo->identifier; + break; + case SNAT_PROTOCOL_UDP: + case SNAT_PROTOCOL_TCP: + port = ((tcp_udp_header_t*)l4_header)->src_port; + break; + default: + return vlib_get_thread_index (); + } + } + } + + /* try static mappings with port */ + if (PREDICT_FALSE (pool_elts (sm->static_mappings))) + { + m_key.addr = ip0->dst_address; + m_key.port = clib_net_to_host_u16 (port); + m_key.protocol = proto; + m_key.fib_index = rx_fib_index0; + kv.key = m_key.as_u64; + if (!clib_bihash_search_8_8 (&sm->static_mapping_by_external, &kv, &value)) + { + m = pool_elt_at_index (sm->static_mappings, value.value); + return m->worker_index; + } + } + + /* worker by outside port */ + return (u32) ((clib_net_to_host_u16 (port) - 1024) / sm->port_per_thread); +} + +static clib_error_t * +snat_config (vlib_main_t * vm, unformat_input_t * input) +{ + snat_main_t * sm = &snat_main; + u32 translation_buckets = 1024; + u32 translation_memory_size = 128<<20; + u32 user_buckets = 128; + u32 user_memory_size = 64<<20; + u32 max_translations_per_user = 100; + u32 outside_vrf_id = 0; + u32 inside_vrf_id = 0; + u32 static_mapping_buckets = 1024; + u32 static_mapping_memory_size = 64<<20; + u8 static_mapping_only = 0; + u8 static_mapping_connection_tracking = 0; + snat_main_per_thread_data_t *tsm; + + sm->deterministic = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "translation hash buckets %d", &translation_buckets)) + ; + else if (unformat (input, "translation hash memory %d", + &translation_memory_size)); + else if (unformat (input, "user hash buckets %d", &user_buckets)) + ; + else if (unformat (input, "user hash memory %d", + &user_memory_size)) + ; + else if (unformat (input, "max translations per user %d", + &max_translations_per_user)) + ; + else if (unformat (input, "outside VRF id %d", + &outside_vrf_id)) + ; + else if (unformat (input, "inside VRF id %d", + &inside_vrf_id)) + ; + else if (unformat (input, "static mapping only")) + { + static_mapping_only = 1; + if (unformat (input, "connection tracking")) + static_mapping_connection_tracking = 1; + } + else if (unformat (input, "deterministic")) + sm->deterministic = 1; + else + return clib_error_return (0, "unknown input '%U'", + format_unformat_error, input); + } + + /* for show commands, etc. 
*/ + sm->translation_buckets = translation_buckets; + sm->translation_memory_size = translation_memory_size; + /* do not exceed load factor 10 */ + sm->max_translations = 10 * translation_buckets; + sm->user_buckets = user_buckets; + sm->user_memory_size = user_memory_size; + sm->max_translations_per_user = max_translations_per_user; + sm->outside_vrf_id = outside_vrf_id; + sm->outside_fib_index = fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, + outside_vrf_id, + FIB_SOURCE_PLUGIN_HI); + sm->inside_vrf_id = inside_vrf_id; + sm->inside_fib_index = fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, + inside_vrf_id, + FIB_SOURCE_PLUGIN_HI); + sm->static_mapping_only = static_mapping_only; + sm->static_mapping_connection_tracking = static_mapping_connection_tracking; + + if (sm->deterministic) + { + sm->in2out_node_index = snat_det_in2out_node.index; + sm->in2out_output_node_index = ~0; + sm->out2in_node_index = snat_det_out2in_node.index; + sm->icmp_match_in2out_cb = icmp_match_in2out_det; + sm->icmp_match_out2in_cb = icmp_match_out2in_det; + } + else + { + sm->worker_in2out_cb = snat_get_worker_in2out_cb; + sm->worker_out2in_cb = snat_get_worker_out2in_cb; + sm->in2out_node_index = snat_in2out_node.index; + sm->in2out_output_node_index = snat_in2out_output_node.index; + sm->out2in_node_index = snat_out2in_node.index; + if (!static_mapping_only || + (static_mapping_only && static_mapping_connection_tracking)) + { + sm->icmp_match_in2out_cb = icmp_match_in2out_slow; + sm->icmp_match_out2in_cb = icmp_match_out2in_slow; + + vec_foreach (tsm, sm->per_thread_data) + { + clib_bihash_init_8_8 (&tsm->in2out, "in2out", translation_buckets, + translation_memory_size); + + clib_bihash_init_8_8 (&tsm->out2in, "out2in", translation_buckets, + translation_memory_size); + + clib_bihash_init_8_8 (&tsm->user_hash, "users", user_buckets, + user_memory_size); + } + + clib_bihash_init_16_8 (&sm->in2out_ed, "in2out-ed", + translation_buckets, translation_memory_size); + + clib_bihash_init_16_8 (&sm->out2in_ed, "out2in-ed", + translation_buckets, translation_memory_size); + } + else + { + sm->icmp_match_in2out_cb = icmp_match_in2out_fast; + sm->icmp_match_out2in_cb = icmp_match_out2in_fast; + } + clib_bihash_init_8_8 (&sm->static_mapping_by_local, + "static_mapping_by_local", static_mapping_buckets, + static_mapping_memory_size); + + clib_bihash_init_8_8 (&sm->static_mapping_by_external, + "static_mapping_by_external", static_mapping_buckets, + static_mapping_memory_size); + } + + return 0; +} + +VLIB_CONFIG_FUNCTION (snat_config, "nat"); + +u8 * format_snat_session_state (u8 * s, va_list * args) +{ + u32 i = va_arg (*args, u32); + u8 *t = 0; + + switch (i) + { +#define _(v, N, str) case SNAT_SESSION_##N: t = (u8 *) str; break; + foreach_snat_session_state +#undef _ + default: + t = format (t, "unknown"); + } + s = format (s, "%s", t); + return s; +} + +u8 * format_snat_key (u8 * s, va_list * args) +{ + snat_session_key_t * key = va_arg (*args, snat_session_key_t *); + + s = format (s, "%U proto %U port %d fib %d", + format_ip4_address, &key->addr, + format_snat_protocol, key->protocol, + clib_net_to_host_u16 (key->port), key->fib_index); + return s; +} + +u8 * format_snat_session (u8 * s, va_list * args) +{ + snat_main_t * sm __attribute__((unused)) = va_arg (*args, snat_main_t *); + snat_session_t * sess = va_arg (*args, snat_session_t *); + + if (snat_is_unk_proto_session (sess)) + { + s = format (s, " i2o %U proto %u fib %u\n", + format_ip4_address, &sess->in2out.addr, sess->in2out.port, + 
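/* for unknown-protocol sessions the session key port field stores the IP protocol number, printed here via "proto %u" */ +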
sess->in2out.fib_index); + s = format (s, " o2i %U proto %u fib %u\n", + format_ip4_address, &sess->out2in.addr, sess->out2in.port, + sess->out2in.fib_index); + } + else + { + s = format (s, " i2o %U\n", format_snat_key, &sess->in2out); + s = format (s, " o2i %U\n", format_snat_key, &sess->out2in); + } + if (sess->ext_host_addr.as_u32) + s = format (s, " external host %U\n", + format_ip4_address, &sess->ext_host_addr); + s = format (s, " last heard %.2f\n", sess->last_heard); + s = format (s, " total pkts %d, total bytes %lld\n", + sess->total_pkts, sess->total_bytes); + if (snat_is_session_static (sess)) + s = format (s, " static translation\n"); + else + s = format (s, " dynamic translation\n"); + if (sess->flags & SNAT_SESSION_FLAG_LOAD_BALANCING) + s = format (s, " load-balancing\n"); + + return s; +} + +u8 * format_snat_user (u8 * s, va_list * args) +{ + snat_main_per_thread_data_t * sm = va_arg (*args, snat_main_per_thread_data_t *); + snat_user_t * u = va_arg (*args, snat_user_t *); + int verbose = va_arg (*args, int); + dlist_elt_t * head, * elt; + u32 elt_index, head_index; + u32 session_index; + snat_session_t * sess; + + s = format (s, "%U: %d dynamic translations, %d static translations\n", + format_ip4_address, &u->addr, u->nsessions, u->nstaticsessions); + + if (verbose == 0) + return s; + + if (u->nsessions || u->nstaticsessions) + { + head_index = u->sessions_per_user_list_head_index; + head = pool_elt_at_index (sm->list_pool, head_index); + + elt_index = head->next; + elt = pool_elt_at_index (sm->list_pool, elt_index); + session_index = elt->value; + + while (session_index != ~0) + { + sess = pool_elt_at_index (sm->sessions, session_index); + + s = format (s, " %U\n", format_snat_session, sm, sess); + + elt_index = elt->next; + elt = pool_elt_at_index (sm->list_pool, elt_index); + session_index = elt->value; + } + } + + return s; +} + +u8 * format_snat_static_mapping (u8 * s, va_list * args) +{ + snat_static_mapping_t *m = va_arg (*args, snat_static_mapping_t *); + nat44_lb_addr_port_t *local; + + if (m->addr_only) + s = format (s, "local %U external %U vrf %d", + format_ip4_address, &m->local_addr, + format_ip4_address, &m->external_addr, + m->vrf_id); + else + { + if (vec_len (m->locals)) + { + s = format (s, "%U vrf %d external %U:%d", + format_snat_protocol, m->proto, + m->vrf_id, + format_ip4_address, &m->external_addr, m->external_port); + vec_foreach (local, m->locals) + s = format (s, "\n local %U:%d probability %d%%", + format_ip4_address, &local->addr, local->port, + local->probability); + } + else + s = format (s, "%U local %U:%d external %U:%d vrf %d", + format_snat_protocol, m->proto, + format_ip4_address, &m->local_addr, m->local_port, + format_ip4_address, &m->external_addr, m->external_port, + m->vrf_id); + } + return s; +} + +u8 * format_snat_static_map_to_resolve (u8 * s, va_list * args) +{ + snat_static_map_resolve_t *m = va_arg (*args, snat_static_map_resolve_t *); + vnet_main_t *vnm = vnet_get_main(); + + if (m->addr_only) + s = format (s, "local %U external %U vrf %d", + format_ip4_address, &m->l_addr, + format_vnet_sw_interface_name, vnm, + vnet_get_sw_interface (vnm, m->sw_if_index), + m->vrf_id); + else + s = format (s, "%U local %U:%d external %U:%d vrf %d", + format_snat_protocol, m->proto, + format_ip4_address, &m->l_addr, m->l_port, + format_vnet_sw_interface_name, vnm, + vnet_get_sw_interface (vnm, m->sw_if_index), m->e_port, + m->vrf_id); + + return s; +} + +u8 * format_det_map_ses (u8 * s, va_list * args) +{ + snat_det_map_t * det_map =
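/* deterministic NAT maps many inside hosts to one outside address: out_offset = in_offset / sharing_ratio, as computed below */ +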
va_arg (*args, snat_det_map_t *); + ip4_address_t in_addr, out_addr; + u32 in_offset, out_offset; + snat_det_session_t * ses = va_arg (*args, snat_det_session_t *); + u32 * i = va_arg (*args, u32 *); + + u32 user_index = *i / SNAT_DET_SES_PER_USER; + in_addr.as_u32 = clib_host_to_net_u32 ( + clib_net_to_host_u32(det_map->in_addr.as_u32) + user_index); + in_offset = clib_net_to_host_u32(in_addr.as_u32) - + clib_net_to_host_u32(det_map->in_addr.as_u32); + out_offset = in_offset / det_map->sharing_ratio; + out_addr.as_u32 = clib_host_to_net_u32( + clib_net_to_host_u32(det_map->out_addr.as_u32) + out_offset); + s = format (s, "in %U:%d out %U:%d external host %U:%d state: %U expire: %d\n", + format_ip4_address, &in_addr, + clib_net_to_host_u16 (ses->in_port), + format_ip4_address, &out_addr, + clib_net_to_host_u16 (ses->out.out_port), + format_ip4_address, &ses->out.ext_host_addr, + clib_net_to_host_u16 (ses->out.ext_host_port), + format_snat_session_state, ses->state, + ses->expire); + + return s; +} + +static clib_error_t * +show_snat_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + int verbose = 0; + snat_main_t * sm = &snat_main; + snat_user_t * u; + snat_static_mapping_t *m; + snat_interface_t *i; + snat_address_t * ap; + vnet_main_t *vnm = vnet_get_main(); + snat_main_per_thread_data_t *tsm; + u32 users_num = 0, sessions_num = 0, *worker, *sw_if_index; + uword j = 0; + snat_static_map_resolve_t *rp; + snat_det_map_t * dm; + snat_det_session_t * ses; + + if (unformat (input, "detail")) + verbose = 1; + else if (unformat (input, "verbose")) + verbose = 2; + + if (sm->static_mapping_only) + { + if (sm->static_mapping_connection_tracking) + vlib_cli_output (vm, "NAT plugin mode: static mapping only connection " + "tracking"); + else + vlib_cli_output (vm, "NAT plugin mode: static mapping only"); + } + else if (sm->deterministic) + { + vlib_cli_output (vm, "NAT plugin mode: deterministic mapping"); + } + else + { + vlib_cli_output (vm, "NAT plugin mode: dynamic translations enabled"); + } + + if (verbose > 0) + { + pool_foreach (i, sm->interfaces, + ({ + vlib_cli_output (vm, "%U %s", format_vnet_sw_interface_name, vnm, + vnet_get_sw_interface (vnm, i->sw_if_index), + i->is_inside ? "in" : "out"); + })); + + pool_foreach (i, sm->output_feature_interfaces, + ({ + vlib_cli_output (vm, "%U output-feature %s", + format_vnet_sw_interface_name, vnm, + vnet_get_sw_interface (vnm, i->sw_if_index), + i->is_inside ? 
"in" : "out"); + })); + + if (vec_len (sm->auto_add_sw_if_indices)) + { + vlib_cli_output (vm, "NAT44 pool addresses interfaces:"); + vec_foreach (sw_if_index, sm->auto_add_sw_if_indices) + { + vlib_cli_output (vm, "%U", format_vnet_sw_interface_name, vnm, + vnet_get_sw_interface (vnm, *sw_if_index)); + } + } + + vec_foreach (ap, sm->addresses) + { + vlib_cli_output (vm, "%U", format_ip4_address, &ap->addr); + if (ap->fib_index != ~0) + vlib_cli_output (vm, " tenant VRF: %u", + ip4_fib_get(ap->fib_index)->table_id); + else + vlib_cli_output (vm, " tenant VRF independent"); +#define _(N, i, n, s) \ + vlib_cli_output (vm, " %d busy %s ports", ap->busy_##n##_ports, s); + foreach_snat_protocol +#undef _ + } + } + + if (sm->num_workers > 1) + { + vlib_cli_output (vm, "%d workers", vec_len (sm->workers)); + if (verbose > 0) + { + vec_foreach (worker, sm->workers) + { + vlib_worker_thread_t *w = + vlib_worker_threads + *worker + sm->first_worker_index; + vlib_cli_output (vm, " %s", w->name); + } + } + } + + if (sm->deterministic) + { + vlib_cli_output (vm, "udp timeout: %dsec", sm->udp_timeout); + vlib_cli_output (vm, "tcp-established timeout: %dsec", + sm->tcp_established_timeout); + vlib_cli_output (vm, "tcp-transitory timeout: %dsec", + sm->tcp_transitory_timeout); + vlib_cli_output (vm, "icmp timeout: %dsec", sm->icmp_timeout); + vlib_cli_output (vm, "%d deterministic mappings", + pool_elts (sm->det_maps)); + if (verbose > 0) + { + pool_foreach (dm, sm->det_maps, + ({ + vlib_cli_output (vm, "in %U/%d out %U/%d\n", + format_ip4_address, &dm->in_addr, dm->in_plen, + format_ip4_address, &dm->out_addr, dm->out_plen); + vlib_cli_output (vm, " outside address sharing ratio: %d\n", + dm->sharing_ratio); + vlib_cli_output (vm, " number of ports per inside host: %d\n", + dm->ports_per_host); + vlib_cli_output (vm, " sessions number: %d\n", dm->ses_num); + if (verbose > 1) + { + vec_foreach_index (j, dm->sessions) + { + ses = vec_elt_at_index (dm->sessions, j); + if (ses->in_port) + vlib_cli_output (vm, " %U", format_det_map_ses, dm, ses, + &j); + } + } + })); + } + } + else + { + if (sm->static_mapping_only && !(sm->static_mapping_connection_tracking)) + { + vlib_cli_output (vm, "%d static mappings", + pool_elts (sm->static_mappings)); + + if (verbose > 0) + { + pool_foreach (m, sm->static_mappings, + ({ + vlib_cli_output (vm, "%U", format_snat_static_mapping, m); + })); + } + } + else + { + vec_foreach (tsm, sm->per_thread_data) + { + users_num += pool_elts (tsm->users); + sessions_num += pool_elts (tsm->sessions); + } + + vlib_cli_output (vm, "%d users, %d outside addresses, %d active sessions," + " %d static mappings", + users_num, + vec_len (sm->addresses), + sessions_num, + pool_elts (sm->static_mappings)); + + if (verbose > 0) + { + vlib_cli_output (vm, "%U", format_bihash_16_8, &sm->in2out_ed, + verbose - 1); + vlib_cli_output (vm, "%U", format_bihash_16_8, &sm->out2in_ed, + verbose - 1); + vec_foreach_index (j, sm->per_thread_data) + { + tsm = vec_elt_at_index (sm->per_thread_data, j); + + if (pool_elts (tsm->users) == 0) + continue; + + vlib_worker_thread_t *w = vlib_worker_threads + j; + vlib_cli_output (vm, "Thread %d (%s at lcore %u):", j, w->name, + w->lcore_id); + vlib_cli_output (vm, " %U", format_bihash_8_8, &tsm->in2out, + verbose - 1); + vlib_cli_output (vm, " %U", format_bihash_8_8, &tsm->out2in, + verbose - 1); + vlib_cli_output (vm, " %d list pool elements", + pool_elts (tsm->list_pool)); + + pool_foreach (u, tsm->users, + ({ + vlib_cli_output (vm, " %U", format_snat_user, tsm, u, 
+ verbose - 1); + })); + } + + if (pool_elts (sm->static_mappings)) + { + vlib_cli_output (vm, "static mappings:"); + pool_foreach (m, sm->static_mappings, + ({ + vlib_cli_output (vm, "%U", format_snat_static_mapping, m); + })); + for (j = 0; j < vec_len (sm->to_resolve); j++) + { + rp = sm->to_resolve + j; + vlib_cli_output (vm, "%U", + format_snat_static_map_to_resolve, rp); + } + } + } + } + } + return 0; +} + +VLIB_CLI_COMMAND (show_snat_command, static) = { + .path = "show nat44", + .short_help = "show nat44", + .function = show_snat_command_fn, +}; + + +static void +snat_ip4_add_del_interface_address_cb (ip4_main_t * im, + uword opaque, + u32 sw_if_index, + ip4_address_t * address, + u32 address_length, + u32 if_address_index, + u32 is_delete) +{ + snat_main_t *sm = &snat_main; + snat_static_map_resolve_t *rp; + u32 *indices_to_delete = 0; + int i, j; + int rv; + + for (i = 0; i < vec_len(sm->auto_add_sw_if_indices); i++) + { + if (sw_if_index == sm->auto_add_sw_if_indices[i]) + { + if (!is_delete) + { + /* Don't trip over lease renewal, static config */ + for (j = 0; j < vec_len(sm->addresses); j++) + if (sm->addresses[j].addr.as_u32 == address->as_u32) + return; + + snat_add_address (sm, address, ~0); + /* Scan static map resolution vector */ + for (j = 0; j < vec_len (sm->to_resolve); j++) + { + rp = sm->to_resolve + j; + /* On this interface? */ + if (rp->sw_if_index == sw_if_index) + { + /* Add the static mapping */ + rv = snat_add_static_mapping (rp->l_addr, + address[0], + rp->l_port, + rp->e_port, + rp->vrf_id, + rp->addr_only, + ~0 /* sw_if_index */, + rp->proto, + rp->is_add); + if (rv) + clib_warning ("snat_add_static_mapping returned %d", + rv); + vec_add1 (indices_to_delete, j); + } + } + /* If we resolved any of the outstanding static mappings */ + if (vec_len(indices_to_delete)) + { + /* Delete them */ + for (j = vec_len(indices_to_delete)-1; j >= 0; j--) + vec_delete(sm->to_resolve, 1, indices_to_delete[j]); + vec_free(indices_to_delete); + } + return; + } + else + { + (void) snat_del_address(sm, address[0], 1); + return; + } + } + } +} + + +int snat_add_interface_address (snat_main_t *sm, u32 sw_if_index, int is_del) +{ + ip4_main_t * ip4_main = sm->ip4_main; + ip4_address_t * first_int_addr; + snat_static_map_resolve_t *rp; + u32 *indices_to_delete = 0; + int i, j; + + first_int_addr = ip4_interface_first_address (ip4_main, sw_if_index, + 0 /* just want the address*/); + + for (i = 0; i < vec_len(sm->auto_add_sw_if_indices); i++) + { + if (sm->auto_add_sw_if_indices[i] == sw_if_index) + { + if (is_del) + { + /* if have address remove it */ + if (first_int_addr) + (void) snat_del_address (sm, first_int_addr[0], 1); + else + { + for (j = 0; j < vec_len (sm->to_resolve); j++) + { + rp = sm->to_resolve + j; + if (rp->sw_if_index == sw_if_index) + vec_add1 (indices_to_delete, j); + } + if (vec_len(indices_to_delete)) + { + for (j = vec_len(indices_to_delete)-1; j >= 0; j--) + vec_del1(sm->to_resolve, indices_to_delete[j]); + vec_free(indices_to_delete); + } + } + vec_del1(sm->auto_add_sw_if_indices, i); + } + else + return VNET_API_ERROR_VALUE_EXIST; + + return 0; + } + } + + if (is_del) + return VNET_API_ERROR_NO_SUCH_ENTRY; + + /* add to the auto-address list */ + vec_add1(sm->auto_add_sw_if_indices, sw_if_index); + + /* If the address is already bound - or static - add it now */ + if (first_int_addr) + snat_add_address (sm, first_int_addr, ~0); + + return 0; +} + +static clib_error_t * +snat_add_interface_address_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ +
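/* CLI handler for "nat44 add interface address": the interface's first IPv4 address joins the NAT pool, and later address changes are tracked by the callback above */ +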
snat_main_t *sm = &snat_main; + unformat_input_t _line_input, *line_input = &_line_input; + u32 sw_if_index; + int rv; + int is_del = 0; + clib_error_t *error = 0; + + /* Get a line of input. */ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "%U", unformat_vnet_sw_interface, + sm->vnet_main, &sw_if_index)) + ; + else if (unformat (line_input, "del")) + is_del = 1; + else + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, line_input); + goto done; + } + } + + rv = snat_add_interface_address (sm, sw_if_index, is_del); + + switch (rv) + { + case 0: + break; + + default: + error = clib_error_return (0, "snat_add_interface_address returned %d", + rv); + goto done; + } + +done: + unformat_free (line_input); + + return error; +} + +VLIB_CLI_COMMAND (snat_add_interface_address_command, static) = { + .path = "nat44 add interface address", + .short_help = "nat44 add interface address <interface> [del]", + .function = snat_add_interface_address_command_fn, +}; + +static clib_error_t * +snat_det_map_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + snat_main_t *sm = &snat_main; + unformat_input_t _line_input, *line_input = &_line_input; + ip4_address_t in_addr, out_addr; + u32 in_plen, out_plen; + int is_add = 1, rv; + clib_error_t *error = 0; + + /* Get a line of input. */ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "in %U/%u", unformat_ip4_address, &in_addr, &in_plen)) + ; + else if (unformat (line_input, "out %U/%u", unformat_ip4_address, &out_addr, &out_plen)) + ; + else if (unformat (line_input, "del")) + is_add = 0; + else + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, line_input); + goto done; + } + } + + rv = snat_det_add_map(sm, &in_addr, (u8) in_plen, &out_addr, (u8)out_plen, + is_add); + + if (rv) + { + error = clib_error_return (0, "snat_det_add_map returned %d", rv); + goto done; + } + +done: + unformat_free (line_input); + + return error; +} + +/*? + * @cliexpar + * @cliexstart{snat deterministic add} + * Create bijective mapping of inside address to outside address and port range + * pairs, with the purpose of enabling deterministic NAT to reduce logging in + * CGN deployments. + * To create deterministic mapping between inside network 10.0.0.0/18 and + * outside network 1.1.1.0/30 use: + * vpp# nat44 deterministic add in 10.0.0.0/18 out 1.1.1.0/30 + * @cliexend +?*/ +VLIB_CLI_COMMAND (snat_det_map_command, static) = { + .path = "nat44 deterministic add", + .short_help = "nat44 deterministic add in <addr>/<plen> out <addr>/<plen> [del]", + .function = snat_det_map_command_fn, +}; + +static clib_error_t * +snat_det_forward_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + snat_main_t *sm = &snat_main; + unformat_input_t _line_input, *line_input = &_line_input; + ip4_address_t in_addr, out_addr; + u16 lo_port; + snat_det_map_t * dm; + clib_error_t *error = 0; + + /* Get a line of input.
*/ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "%U", unformat_ip4_address, &in_addr)) + ; + else + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, line_input); + goto done; + } + } + + dm = snat_det_map_by_user(sm, &in_addr); + if (!dm) + vlib_cli_output (vm, "no match"); + else + { + snat_det_forward (dm, &in_addr, &out_addr, &lo_port); + vlib_cli_output (vm, "%U:<%d-%d>", format_ip4_address, &out_addr, + lo_port, lo_port + dm->ports_per_host - 1); + } + +done: + unformat_free (line_input); + + return error; +} + +/*? + * @cliexpar + * @cliexstart{snat deterministic forward} + * Return outside address and port range from inside address for deterministic + * NAT. + * To obtain outside address and port of inside host use: + * vpp# nat44 deterministic forward 10.0.0.2 + * 1.1.1.0:<1054-1068> + * @cliexend +?*/ +VLIB_CLI_COMMAND (snat_det_forward_command, static) = { + .path = "nat44 deterministic forward", + .short_help = "nat44 deterministic forward <addr>", + .function = snat_det_forward_command_fn, +}; + +static clib_error_t * +snat_det_reverse_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + snat_main_t *sm = &snat_main; + unformat_input_t _line_input, *line_input = &_line_input; + ip4_address_t in_addr, out_addr; + u32 out_port; + snat_det_map_t * dm; + clib_error_t *error = 0; + + /* Get a line of input. */ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "%U:%d", unformat_ip4_address, &out_addr, &out_port)) + ; + else + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (out_port < 1024 || out_port > 65535) + { + error = clib_error_return (0, "wrong port, must be <1024-65535>"); + goto done; + } + + dm = snat_det_map_by_out(sm, &out_addr); + if (!dm) + vlib_cli_output (vm, "no match"); + else + { + snat_det_reverse (dm, &out_addr, (u16) out_port, &in_addr); + vlib_cli_output (vm, "%U", format_ip4_address, &in_addr); + } + +done: + unformat_free (line_input); + + return error; +} + +/*? + * @cliexpar + * @cliexstart{snat deterministic reverse} + * Return inside address from outside address and port for deterministic NAT. + * To obtain inside host address from outside address and port use: + * vpp# nat44 deterministic reverse 1.1.1.1:1276 + * 10.0.16.16 + * @cliexend +?*/ +VLIB_CLI_COMMAND (snat_det_reverse_command, static) = { + .path = "nat44 deterministic reverse", + .short_help = "nat44 deterministic reverse <addr>:<port>", + .function = snat_det_reverse_command_fn, +}; + +static clib_error_t * +set_timeout_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + snat_main_t *sm = &snat_main; + unformat_input_t _line_input, *line_input = &_line_input; + clib_error_t *error = 0; + + /* Get a line of input.
*/ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "udp %u", &sm->udp_timeout)) + ; + else if (unformat (line_input, "tcp-established %u", + &sm->tcp_established_timeout)) + ; + else if (unformat (line_input, "tcp-transitory %u", + &sm->tcp_transitory_timeout)) + ; + else if (unformat (line_input, "icmp %u", &sm->icmp_timeout)) + ; + else if (unformat (line_input, "reset")) + { + sm->udp_timeout = SNAT_UDP_TIMEOUT; + sm->tcp_established_timeout = SNAT_TCP_ESTABLISHED_TIMEOUT; + sm->tcp_transitory_timeout = SNAT_TCP_TRANSITORY_TIMEOUT; + sm->icmp_timeout = SNAT_ICMP_TIMEOUT; + } + else + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, line_input); + goto done; + } + } + +done: + unformat_free (line_input); + + return error; +} + +/*? + * @cliexpar + * @cliexstart{set snat deterministic timeout} + * Set values of timeouts for deterministic NAT (in seconds), use: + * vpp# set nat44 deterministic timeout udp 120 tcp-established 7500 + * tcp-transitory 250 icmp 90 + * To reset default values use: + * vpp# set nat44 deterministic timeout reset + * @cliexend +?*/ +VLIB_CLI_COMMAND (set_timeout_command, static) = { + .path = "set nat44 deterministic timeout", + .function = set_timeout_command_fn, + .short_help = + "set nat44 deterministic timeout [udp <sec> | tcp-established <sec> " + "| tcp-transitory <sec> | icmp <sec> | reset]", +}; + +static clib_error_t * +snat_det_close_session_out_fn (vlib_main_t *vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + snat_main_t *sm = &snat_main; + unformat_input_t _line_input, *line_input = &_line_input; + ip4_address_t out_addr, ext_addr, in_addr; + u32 out_port, ext_port; + snat_det_map_t * dm; + snat_det_session_t * ses; + snat_det_out_key_t key; + clib_error_t *error = 0; + + /* Get a line of input. */ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "%U:%d %U:%d", + unformat_ip4_address, &out_addr, &out_port, + unformat_ip4_address, &ext_addr, &ext_port)) + ; + else + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, line_input); + goto done; + } + } + + dm = snat_det_map_by_out(sm, &out_addr); + if (!dm) + vlib_cli_output (vm, "no match"); + else + { + snat_det_reverse(dm, &out_addr, (u16)out_port, &in_addr); + key.ext_host_addr = ext_addr; + key.ext_host_port = ntohs((u16)ext_port); + key.out_port = ntohs((u16)out_port); + ses = snat_det_get_ses_by_out(dm, &out_addr, key.as_u64); + if (!ses) + vlib_cli_output (vm, "no match"); + else + snat_det_ses_close(dm, ses); + } + +done: + unformat_free (line_input); + + return error; +} + +/*?
+ * @cliexpar + * @cliexstart{snat deterministic close session out} + * Close a session using its outside IP address and port + * and external IP address and port, use: + * vpp# nat44 deterministic close session out 1.1.1.1:1276 2.2.2.2:2387 + * @cliexend +?*/ +VLIB_CLI_COMMAND (snat_det_close_session_out_command, static) = { + .path = "nat44 deterministic close session out", + .short_help = "nat44 deterministic close session out " + "<out_addr>:<out_port> <ext_addr>:<ext_port>", + .function = snat_det_close_session_out_fn, +}; + +static clib_error_t * +snat_det_close_session_in_fn (vlib_main_t *vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + snat_main_t *sm = &snat_main; + unformat_input_t _line_input, *line_input = &_line_input; + ip4_address_t in_addr, ext_addr; + u32 in_port, ext_port; + snat_det_map_t * dm; + snat_det_session_t * ses; + snat_det_out_key_t key; + clib_error_t *error = 0; + + /* Get a line of input. */ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "%U:%d %U:%d", + unformat_ip4_address, &in_addr, &in_port, + unformat_ip4_address, &ext_addr, &ext_port)) + ; + else + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, line_input); + goto done; + } + } + + dm = snat_det_map_by_user (sm, &in_addr); + if (!dm) + vlib_cli_output (vm, "no match"); + else + { + key.ext_host_addr = ext_addr; + key.ext_host_port = ntohs ((u16)ext_port); + ses = snat_det_find_ses_by_in (dm, &in_addr, ntohs((u16)in_port), key); + if (!ses) + vlib_cli_output (vm, "no match"); + else + snat_det_ses_close(dm, ses); + } + +done: + unformat_free(line_input); + + return error; +} + +/*? + * @cliexpar + * @cliexstart{snat deterministic close session in} + * Close a session using its inside IP address and port + * and external IP address and port, use: + * vpp# nat44 deterministic close session in 3.3.3.3:3487 2.2.2.2:2387 + * @cliexend +?*/ +VLIB_CLI_COMMAND (snat_det_close_session_in_command, static) = { + .path = "nat44 deterministic close session in", + .short_help = "nat44 deterministic close session in " + "<in_addr>:<in_port> <ext_addr>:<ext_port>", + .function = snat_det_close_session_in_fn, +}; diff --git a/src/plugins/nat/nat.h b/src/plugins/nat/nat.h new file mode 100644 index 00000000..e53e924f --- /dev/null +++ b/src/plugins/nat/nat.h @@ -0,0 +1,555 @@ + +/* + * nat.h - NAT plugin definitions + * + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +#ifndef __included_nat_h__ +#define __included_nat_h__ + +#include <vnet/vnet.h> +#include <vnet/ip/ip.h> +#include <vnet/ethernet/ethernet.h> +#include <vnet/ip/icmp46_packet.h> +#include <vnet/api_errno.h> +#include <vppinfra/bihash_8_8.h> +#include <vppinfra/bihash_16_8.h> +#include <vppinfra/dlist.h> +#include <vppinfra/error.h> +#include <vlibapi/api.h> + + +#define SNAT_UDP_TIMEOUT 300 +#define SNAT_UDP_TIMEOUT_MIN 120 +#define SNAT_TCP_TRANSITORY_TIMEOUT 240 +#define SNAT_TCP_ESTABLISHED_TIMEOUT 7440 +#define SNAT_TCP_INCOMING_SYN 6 +#define SNAT_ICMP_TIMEOUT 60 + +#define SNAT_FLAG_HAIRPINNING (1 << 0) + +/* Key */ +typedef struct { + union + { + struct + { + ip4_address_t addr; + u16 port; + u16 protocol:3, + fib_index:13; + }; + u64 as_u64; + }; +} snat_session_key_t; + +typedef struct { + union + { + struct + { + ip4_address_t l_addr; + ip4_address_t r_addr; + u32 fib_index; + u16 l_port; + u8 proto; + u8 rsvd; + }; + u64 as_u64[2]; + }; +} nat_ed_ses_key_t; + +typedef struct { + union + { + struct + { + ip4_address_t ext_host_addr; + u16 ext_host_port; + u16 out_port; + }; + u64 as_u64; + }; +} snat_det_out_key_t; + +typedef struct { + union + { + struct + { + ip4_address_t addr; + u32 fib_index; + }; + u64 as_u64; + }; +} snat_user_key_t; + + +#define foreach_snat_protocol \ + _(UDP, 0, udp, "udp") \ + _(TCP, 1, tcp, "tcp") \ + _(ICMP, 2, icmp, "icmp") + +typedef enum { +#define _(N, i, n, s) SNAT_PROTOCOL_##N = i, + foreach_snat_protocol +#undef _ +} snat_protocol_t; + + +#define foreach_snat_session_state \ + _(0, UNKNOWN, "unknown") \ + _(1, UDP_ACTIVE, "udp-active") \ + _(2, TCP_SYN_SENT, "tcp-syn-sent") \ + _(3, TCP_ESTABLISHED, "tcp-established") \ + _(4, TCP_FIN_WAIT, "tcp-fin-wait") \ + _(5, TCP_CLOSE_WAIT, "tcp-close-wait") \ + _(6, TCP_LAST_ACK, "tcp-last-ack") \ + _(7, ICMP_ACTIVE, "icmp-active") + +typedef enum { +#define _(v, N, s) SNAT_SESSION_##N = v, + foreach_snat_session_state +#undef _ +} snat_session_state_t; + + +#define SNAT_SESSION_FLAG_STATIC_MAPPING 1 +#define SNAT_SESSION_FLAG_UNKNOWN_PROTO 2 +#define SNAT_SESSION_FLAG_LOAD_BALANCING 4 + +typedef CLIB_PACKED(struct { + snat_session_key_t out2in; /* 0-15 */ + + snat_session_key_t in2out; /* 16-31 */ + + u32 flags; /* 32-35 */ + + /* per-user translations */ + u32 per_user_index; /* 36-39 */ + + u32 per_user_list_head_index; /* 40-43 */ + + /* Last heard timer */ + f64 last_heard; /* 44-51 */ + + u64 total_bytes; /* 52-59 */ + + u32 total_pkts; /* 60-63 */ + + /* Outside address */ + u32 outside_address_index; /* 64-67 */ + + /* External host address */ + ip4_address_t ext_host_addr; /* 68-71 */ + +}) snat_session_t; + + +typedef struct { + ip4_address_t addr; + u32 fib_index; + u32 sessions_per_user_list_head_index; + u32 nsessions; + u32 nstaticsessions; +} snat_user_t; + +typedef struct { + ip4_address_t addr; + u32 fib_index; +#define _(N, i, n, s) \ + u16 busy_##n##_ports; \ + u16 * busy_##n##_ports_per_thread; \ + uword * busy_##n##_port_bitmap; + foreach_snat_protocol +#undef _ +} snat_address_t; + +typedef struct { + u16 in_port; + snat_det_out_key_t out; + u8 state; + u32 expire; +} snat_det_session_t; + +typedef struct { + ip4_address_t in_addr; + u8 in_plen; + ip4_address_t out_addr; + u8 out_plen; + u32 sharing_ratio; + u16 ports_per_host; + u32 ses_num; + /* vector of sessions */ + snat_det_session_t * sessions; +} snat_det_map_t; + +typedef struct { + ip4_address_t addr; + u16 port; + u8 probability; + u8 prefix; +} nat44_lb_addr_port_t; + +typedef struct { + ip4_address_t 
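/* inside (local) address of a NAT44 static mapping */ +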
local_addr; + ip4_address_t external_addr; + u16 local_port; + u16 external_port; + u8 addr_only; + u32 vrf_id; + u32 fib_index; + snat_protocol_t proto; + u32 worker_index; + nat44_lb_addr_port_t *locals; +} snat_static_mapping_t; + +typedef struct { + u32 sw_if_index; + u8 is_inside; +} snat_interface_t; + +typedef struct { + ip4_address_t l_addr; + u16 l_port; + u16 e_port; + u32 sw_if_index; + u32 vrf_id; + snat_protocol_t proto; + int addr_only; + int is_add; +} snat_static_map_resolve_t; + +typedef struct { + /* Main lookup tables */ + clib_bihash_8_8_t out2in; + clib_bihash_8_8_t in2out; + + /* Find-a-user => src address lookup */ + clib_bihash_8_8_t user_hash; + + /* User pool */ + snat_user_t * users; + + /* Session pool */ + snat_session_t * sessions; + + /* Pool of doubly-linked list elements */ + dlist_elt_t * list_pool; + + u32 snat_thread_index; +} snat_main_per_thread_data_t; + +struct snat_main_s; + +typedef u32 snat_icmp_match_function_t (struct snat_main_s *sm, + vlib_node_runtime_t *node, + u32 thread_index, + vlib_buffer_t *b0, + ip4_header_t *ip0, + u8 *p_proto, + snat_session_key_t *p_value, + u8 *p_dont_translate, + void *d, + void *e); + +typedef u32 (snat_get_worker_function_t) (ip4_header_t * ip, u32 rx_fib_index); + +typedef struct snat_main_s { + /* Endpoint address dependent sessions lookup tables */ + clib_bihash_16_8_t out2in_ed; + clib_bihash_16_8_t in2out_ed; + + snat_icmp_match_function_t * icmp_match_in2out_cb; + snat_icmp_match_function_t * icmp_match_out2in_cb; + + u32 num_workers; + u32 first_worker_index; + u32 next_worker; + u32 * workers; + snat_get_worker_function_t * worker_in2out_cb; + snat_get_worker_function_t * worker_out2in_cb; + u16 port_per_thread; + u32 num_snat_thread; + + /* Per thread data */ + snat_main_per_thread_data_t * per_thread_data; + + /* Find a static mapping by local */ + clib_bihash_8_8_t static_mapping_by_local; + + /* Find a static mapping by external */ + clib_bihash_8_8_t static_mapping_by_external; + + /* Static mapping pool */ + snat_static_mapping_t * static_mappings; + + /* Interface pool */ + snat_interface_t * interfaces; + snat_interface_t * output_feature_interfaces; + + /* Vector of outside addresses */ + snat_address_t * addresses; + + /* sw_if_indices whose intfc addresses should be auto-added */ + u32 * auto_add_sw_if_indices; + + /* vector of interface address static mappings to resolve. 
*/ + snat_static_map_resolve_t *to_resolve; + + /* Randomize port allocation order */ + u32 random_seed; + + /* Worker handoff index */ + u32 fq_in2out_index; + u32 fq_in2out_output_index; + u32 fq_out2in_index; + + /* in2out and out2in node index */ + u32 in2out_node_index; + u32 in2out_output_node_index; + u32 out2in_node_index; + + /* Deterministic NAT */ + snat_det_map_t * det_maps; + + /* Config parameters */ + u8 static_mapping_only; + u8 static_mapping_connection_tracking; + u8 deterministic; + u32 translation_buckets; + u32 translation_memory_size; + u32 max_translations; + u32 user_buckets; + u32 user_memory_size; + u32 max_translations_per_user; + u32 outside_vrf_id; + u32 outside_fib_index; + u32 inside_vrf_id; + u32 inside_fib_index; + + /* tenant VRF aware address pool activation flag */ + u8 vrf_mode; + + /* values of various timeouts */ + u32 udp_timeout; + u32 tcp_established_timeout; + u32 tcp_transitory_timeout; + u32 icmp_timeout; + + /* API message ID base */ + u16 msg_id_base; + + /* convenience */ + vlib_main_t * vlib_main; + vnet_main_t * vnet_main; + ip4_main_t * ip4_main; + ip_lookup_main_t * ip4_lookup_main; + api_main_t * api_main; +} snat_main_t; + +extern snat_main_t snat_main; +extern vlib_node_registration_t snat_in2out_node; +extern vlib_node_registration_t snat_in2out_output_node; +extern vlib_node_registration_t snat_out2in_node; +extern vlib_node_registration_t snat_in2out_fast_node; +extern vlib_node_registration_t snat_out2in_fast_node; +extern vlib_node_registration_t snat_in2out_worker_handoff_node; +extern vlib_node_registration_t snat_in2out_output_worker_handoff_node; +extern vlib_node_registration_t snat_out2in_worker_handoff_node; +extern vlib_node_registration_t snat_det_in2out_node; +extern vlib_node_registration_t snat_det_out2in_node; +extern vlib_node_registration_t snat_hairpin_dst_node; +extern vlib_node_registration_t snat_hairpin_src_node; + +void snat_free_outside_address_and_port (snat_main_t * sm, + u32 thread_index, + snat_session_key_t * k, + u32 address_index); + +int snat_alloc_outside_address_and_port (snat_main_t * sm, + u32 fib_index, + u32 thread_index, + snat_session_key_t * k, + u32 * address_indexp); + +int snat_static_mapping_match (snat_main_t * sm, + snat_session_key_t match, + snat_session_key_t * mapping, + u8 by_external, + u8 *is_addr_only); + +void snat_add_del_addr_to_fib (ip4_address_t * addr, + u8 p_len, + u32 sw_if_index, + int is_add); + +format_function_t format_snat_user; + +typedef struct { + u32 cached_sw_if_index; + u32 cached_ip4_address; +} snat_runtime_t; + +/** \brief Check if SNAT session is created from static mapping. + @param s SNAT session + @return 1 if SNAT session is created from static mapping otherwise 0 +*/ +#define snat_is_session_static(s) s->flags & SNAT_SESSION_FLAG_STATIC_MAPPING + +/** \brief Check if SNAT session for unknown protocol. + @param s SNAT session + @return 1 if SNAT session for unknown protocol otherwise 0 +*/ +#define snat_is_unk_proto_session(s) s->flags & SNAT_SESSION_FLAG_UNKNOWN_PROTO + +/* + * Why is this here? Because we don't need to touch this layer to + * simply reply to an icmp. We need to change id to a unique + * value to NAT an echo request/reply. + */ + +typedef struct { + u16 identifier; + u16 sequence; +} icmp_echo_header_t; + +always_inline u32 +ip_proto_to_snat_proto (u8 ip_proto) +{ + u32 snat_proto = ~0; + + snat_proto = (ip_proto == IP_PROTOCOL_UDP) ? SNAT_PROTOCOL_UDP : snat_proto; + snat_proto = (ip_proto == IP_PROTOCOL_TCP) ? 
SNAT_PROTOCOL_TCP : snat_proto; + snat_proto = (ip_proto == IP_PROTOCOL_ICMP) ? SNAT_PROTOCOL_ICMP : snat_proto; + snat_proto = (ip_proto == IP_PROTOCOL_ICMP6) ? SNAT_PROTOCOL_ICMP : snat_proto; + + return snat_proto; +} + +always_inline u8 +snat_proto_to_ip_proto (snat_protocol_t snat_proto) +{ + u8 ip_proto = ~0; + + ip_proto = (snat_proto == SNAT_PROTOCOL_UDP) ? IP_PROTOCOL_UDP : ip_proto; + ip_proto = (snat_proto == SNAT_PROTOCOL_TCP) ? IP_PROTOCOL_TCP : ip_proto; + ip_proto = (snat_proto == SNAT_PROTOCOL_ICMP) ? IP_PROTOCOL_ICMP : ip_proto; + + return ip_proto; +} + +typedef struct { + u16 src_port, dst_port; +} tcp_udp_header_t; + +u32 icmp_match_in2out_fast(snat_main_t *sm, vlib_node_runtime_t *node, + u32 thread_index, vlib_buffer_t *b0, + ip4_header_t *ip0, u8 *p_proto, + snat_session_key_t *p_value, + u8 *p_dont_translate, void *d, void *e); +u32 icmp_match_in2out_slow(snat_main_t *sm, vlib_node_runtime_t *node, + u32 thread_index, vlib_buffer_t *b0, + ip4_header_t *ip0, u8 *p_proto, + snat_session_key_t *p_value, + u8 *p_dont_translate, void *d, void *e); +u32 icmp_match_in2out_det(snat_main_t *sm, vlib_node_runtime_t *node, + u32 thread_index, vlib_buffer_t *b0, + ip4_header_t *ip0, u8 *p_proto, + snat_session_key_t *p_value, + u8 *p_dont_translate, void *d, void *e); +u32 icmp_match_out2in_fast(snat_main_t *sm, vlib_node_runtime_t *node, + u32 thread_index, vlib_buffer_t *b0, + ip4_header_t *ip0, u8 *p_proto, + snat_session_key_t *p_value, + u8 *p_dont_translate, void *d, void *e); +u32 icmp_match_out2in_slow(snat_main_t *sm, vlib_node_runtime_t *node, + u32 thread_index, vlib_buffer_t *b0, + ip4_header_t *ip0, u8 *p_proto, + snat_session_key_t *p_value, + u8 *p_dont_translate, void *d, void *e); +u32 icmp_match_out2in_det(snat_main_t *sm, vlib_node_runtime_t *node, + u32 thread_index, vlib_buffer_t *b0, + ip4_header_t *ip0, u8 *p_proto, + snat_session_key_t *p_value, + u8 *p_dont_translate, void *d, void *e); +void increment_v4_address(ip4_address_t * a); +void snat_add_address(snat_main_t *sm, ip4_address_t *addr, u32 vrf_id); +int snat_del_address(snat_main_t *sm, ip4_address_t addr, u8 delete_sm); +int snat_add_static_mapping(ip4_address_t l_addr, ip4_address_t e_addr, + u16 l_port, u16 e_port, u32 vrf_id, int addr_only, + u32 sw_if_index, snat_protocol_t proto, int is_add); +clib_error_t * snat_api_init(vlib_main_t * vm, snat_main_t * sm); +int snat_set_workers (uword * bitmap); +int snat_interface_add_del(u32 sw_if_index, u8 is_inside, int is_del); +int snat_interface_add_del_output_feature(u32 sw_if_index, u8 is_inside, + int is_del); +int snat_add_interface_address(snat_main_t *sm, u32 sw_if_index, int is_del); +uword unformat_snat_protocol(unformat_input_t * input, va_list * args); +u8 * format_snat_protocol(u8 * s, va_list * args); +int nat44_add_del_lb_static_mapping (ip4_address_t e_addr, u16 e_port, + snat_protocol_t proto, u32 vrf_id, + nat44_lb_addr_port_t *locals, u8 is_add); + +static_always_inline u8 +icmp_is_error_message (icmp46_header_t * icmp) +{ + switch(icmp->type) + { + case ICMP4_destination_unreachable: + case ICMP4_time_exceeded: + case ICMP4_parameter_problem: + case ICMP4_source_quench: + case ICMP4_redirect: + case ICMP4_alternate_host_address: + return 1; + } + return 0; +} + +static_always_inline u8 +is_interface_addr(snat_main_t *sm, vlib_node_runtime_t *node, u32 sw_if_index0, + u32 ip4_addr) +{ + snat_runtime_t *rt = (snat_runtime_t *) node->runtime_data; + ip4_address_t * first_int_addr; + + if (PREDICT_FALSE(rt->cached_sw_if_index != 
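/* node->runtime_data holds a one-entry cache mapping the last-seen sw_if_index to its first IPv4 interface address; the test below refills it on a miss, so the common case (a burst of packets arriving on one interface) costs a single compare. */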
sw_if_index0)) + { + first_int_addr = + ip4_interface_first_address (sm->ip4_main, sw_if_index0, + 0 /* just want the address */); + rt->cached_sw_if_index = sw_if_index0; + if (first_int_addr) + rt->cached_ip4_address = first_int_addr->as_u32; + else + rt->cached_ip4_address = 0; + } + + if (PREDICT_FALSE(ip4_addr == rt->cached_ip4_address)) + return 1; + else + return 0; +} + +always_inline u8 +maximum_sessions_exceeded (snat_main_t *sm, u32 thread_index) +{ + if (pool_elts (sm->per_thread_data[thread_index].sessions) >= sm->max_translations) + return 1; + + return 0; +} + +#endif /* __included_nat_h__ */ diff --git a/src/plugins/nat/nat64.c b/src/plugins/nat/nat64.c new file mode 100644 index 00000000..bfcfa9b3 --- /dev/null +++ b/src/plugins/nat/nat64.c @@ -0,0 +1,861 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @file + * @brief NAT64 implementation + */ + +#include <nat/nat64.h> +#include <nat/nat64_db.h> +#include <vnet/fib/ip4_fib.h> + + +nat64_main_t nat64_main; + +/* *INDENT-OFF* */ + +/* Hook up input features */ +VNET_FEATURE_INIT (nat64_in2out, static) = { + .arc_name = "ip6-unicast", + .node_name = "nat64-in2out", + .runs_before = VNET_FEATURES ("ip6-lookup"), +}; +VNET_FEATURE_INIT (nat64_out2in, static) = { + .arc_name = "ip4-unicast", + .node_name = "nat64-out2in", + .runs_before = VNET_FEATURES ("ip4-lookup"), +}; + +static u8 well_known_prefix[] = { + 0x00, 0x64, 0xff, 0x9b, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00 +}; + +/* *INDENT-ON* */ + +clib_error_t * +nat64_init (vlib_main_t * vm) +{ + nat64_main_t *nm = &nat64_main; + clib_error_t *error = 0; + vlib_thread_main_t *tm = vlib_get_thread_main (); + + nm->is_disabled = 0; + + if (tm->n_vlib_mains > 1) + { + nm->is_disabled = 1; + goto error; + } + + if (nat64_db_init (&nm->db)) + { + error = clib_error_return (0, "NAT64 DB init failed"); + goto error; + } + + /* set session timeouts to default values */ + nm->udp_timeout = SNAT_UDP_TIMEOUT; + nm->icmp_timeout = SNAT_ICMP_TIMEOUT; + nm->tcp_trans_timeout = SNAT_TCP_TRANSITORY_TIMEOUT; + nm->tcp_est_timeout = SNAT_TCP_ESTABLISHED_TIMEOUT; + nm->tcp_incoming_syn_timeout = SNAT_TCP_INCOMING_SYN; + +error: + return error; +} + +int +nat64_add_del_pool_addr (ip4_address_t * addr, u32 vrf_id, u8 is_add) +{ + nat64_main_t *nm = &nat64_main; + snat_address_t *a = 0; + snat_interface_t *interface; + int i; + + /* Check if address already exists */ + for (i = 0; i < vec_len (nm->addr_pool); i++) + { + if (nm->addr_pool[i].addr.as_u32 == addr->as_u32) + { + a = nm->addr_pool + i; + break; + } + } + + if (is_add) + { + if (a) + return VNET_API_ERROR_VALUE_EXIST; + + vec_add2 (nm->addr_pool, a, 1); + a->addr = *addr; + a->fib_index = 0; + if (vrf_id != ~0) + a->fib_index = + fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP6, vrf_id, + FIB_SOURCE_PLUGIN_HI); +#define _(N, i, n, s) \ + clib_bitmap_alloc (a->busy_##n##_port_bitmap, 65535); + foreach_snat_protocol 
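/* foreach_snat_protocol is an X-macro list, so the _() body above is stamped out once per protocol; e.g. for UDP it expands to clib_bitmap_alloc (a->busy_udp_port_bitmap, 65535); giving each pool address a private busy-port bitmap per protocol. */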
+#undef _ + } + else + { + if (!a) + return VNET_API_ERROR_NO_SUCH_ENTRY; + + if (a->fib_index) + fib_table_unlock (a->fib_index, FIB_PROTOCOL_IP6, + FIB_SOURCE_PLUGIN_HI); + +#define _(N, id, n, s) \ + clib_bitmap_free (a->busy_##n##_port_bitmap); + foreach_snat_protocol +#undef _ + /* Delete sessions using address */ + nat64_db_free_out_addr (&nm->db, &a->addr); + vec_del1 (nm->addr_pool, i); + } + + /* Add/del external address to FIB */ + /* *INDENT-OFF* */ + pool_foreach (interface, nm->interfaces, + ({ + if (interface->is_inside) + continue; + + snat_add_del_addr_to_fib (addr, 32, interface->sw_if_index, is_add); + break; + })); + /* *INDENT-ON* */ + + return 0; +} + +void +nat64_pool_addr_walk (nat64_pool_addr_walk_fn_t fn, void *ctx) +{ + nat64_main_t *nm = &nat64_main; + snat_address_t *a = 0; + + /* *INDENT-OFF* */ + vec_foreach (a, nm->addr_pool) + { + if (fn (a, ctx)) + break; + }; + /* *INDENT-ON* */ +} + +int +nat64_add_del_interface (u32 sw_if_index, u8 is_inside, u8 is_add) +{ + nat64_main_t *nm = &nat64_main; + snat_interface_t *interface = 0, *i; + snat_address_t *ap; + const char *feature_name, *arc_name; + + /* Check if interface already exists */ + /* *INDENT-OFF* */ + pool_foreach (i, nm->interfaces, + ({ + if (i->sw_if_index == sw_if_index) + { + interface = i; + break; + } + })); + /* *INDENT-ON* */ + + if (is_add) + { + if (interface) + return VNET_API_ERROR_VALUE_EXIST; + + pool_get (nm->interfaces, interface); + interface->sw_if_index = sw_if_index; + interface->is_inside = is_inside; + + } + else + { + if (!interface) + return VNET_API_ERROR_NO_SUCH_ENTRY; + + pool_put (nm->interfaces, interface); + } + + if (!is_inside) + { + /* *INDENT-OFF* */ + vec_foreach (ap, nm->addr_pool) + snat_add_del_addr_to_fib(&ap->addr, 32, sw_if_index, is_add); + /* *INDENT-ON* */ + } + + arc_name = is_inside ? "ip6-unicast" : "ip4-unicast"; + feature_name = is_inside ?
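/* The pairing matches the VNET_FEATURE_INIT registrations at the top of this file: inside interfaces carry IPv6 traffic and hook the ip6-unicast arc, outside interfaces carry IPv4 and hook ip4-unicast. */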
"nat64-in2out" : "nat64-out2in"; + + return vnet_feature_enable_disable (arc_name, feature_name, sw_if_index, + is_add, 0, 0); +} + +void +nat64_interfaces_walk (nat64_interface_walk_fn_t fn, void *ctx) +{ + nat64_main_t *nm = &nat64_main; + snat_interface_t *i = 0; + + /* *INDENT-OFF* */ + pool_foreach (i, nm->interfaces, + ({ + if (fn (i, ctx)) + break; + })); + /* *INDENT-ON* */ +} + +int +nat64_alloc_out_addr_and_port (u32 fib_index, snat_protocol_t proto, + ip4_address_t * addr, u16 * port) +{ + nat64_main_t *nm = &nat64_main; + snat_main_t *sm = &snat_main; + int i; + snat_address_t *a, *ga = 0; + u32 portnum; + + for (i = 0; i < vec_len (nm->addr_pool); i++) + { + a = nm->addr_pool + i; + switch (proto) + { +#define _(N, j, n, s) \ + case SNAT_PROTOCOL_##N: \ + if (a->busy_##n##_ports < (65535-1024)) \ + { \ + if (a->fib_index == fib_index) \ + { \ + while (1) \ + { \ + portnum = random_u32 (&sm->random_seed); \ + portnum &= 0xFFFF; \ + if (portnum < 1024) \ + continue; \ + if (clib_bitmap_get_no_check (a->busy_##n##_port_bitmap, \ + portnum)) \ + continue; \ + clib_bitmap_set_no_check (a->busy_##n##_port_bitmap, \ + portnum, 1); \ + a->busy_##n##_ports++; \ + *port = portnum; \ + addr->as_u32 = a->addr.as_u32; \ + return 0; \ + } \ + } \ + else if (a->fib_index == 0) \ + ga = a; \ + } \ + break; + foreach_snat_protocol +#undef _ + default: + clib_warning ("unknown protocol"); + return 1; + } + } + + if (ga) + { + switch (proto) + { +#define _(N, j, n, s) \ + case SNAT_PROTOCOL_##N: \ + while (1) \ + { \ + portnum = random_u32 (&sm->random_seed); \ + portnum &= 0xFFFF; \ + if (portnum < 1024) \ + continue; \ + if (clib_bitmap_get_no_check (a->busy_##n##_port_bitmap, \ + portnum)) \ + continue; \ + clib_bitmap_set_no_check (a->busy_##n##_port_bitmap, \ + portnum, 1); \ + a->busy_##n##_ports++; \ + *port = portnum; \ + addr->as_u32 = a->addr.as_u32; \ + return 0; \ + } + break; + foreach_snat_protocol +#undef _ + default: + clib_warning ("unknown protocol"); + return 1; + } + } + + /* Totally out of translations to use... 
*/ + //TODO: IPFix + return 1; +} + +void +nat64_free_out_addr_and_port (ip4_address_t * addr, u16 port, + snat_protocol_t proto) +{ + nat64_main_t *nm = &nat64_main; + int i; + snat_address_t *a; + + for (i = 0; i < vec_len (nm->addr_pool); i++) + { + a = nm->addr_pool + i; + if (addr->as_u32 != a->addr.as_u32) + continue; + switch (proto) + { +#define _(N, j, n, s) \ + case SNAT_PROTOCOL_##N: \ + ASSERT (clib_bitmap_get_no_check (a->busy_##n##_port_bitmap, \ + port) == 1); \ + clib_bitmap_set_no_check (a->busy_##n##_port_bitmap, port, 0); \ + a->busy_##n##_ports--; \ + break; + foreach_snat_protocol +#undef _ + default: + clib_warning ("unknown protocol"); + return; + } + break; + } +} + +int +nat64_add_del_static_bib_entry (ip6_address_t * in_addr, + ip4_address_t * out_addr, u16 in_port, + u16 out_port, u8 proto, u32 vrf_id, u8 is_add) +{ + nat64_main_t *nm = &nat64_main; + nat64_db_bib_entry_t *bibe; + u32 fib_index = fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP6, vrf_id, + FIB_SOURCE_PLUGIN_HI); + snat_protocol_t p = ip_proto_to_snat_proto (proto); + ip46_address_t addr; + int i; + snat_address_t *a; + + addr.as_u64[0] = in_addr->as_u64[0]; + addr.as_u64[1] = in_addr->as_u64[1]; + bibe = + nat64_db_bib_entry_find (&nm->db, &addr, clib_host_to_net_u16 (in_port), + proto, fib_index, 1); + + if (is_add) + { + if (bibe) + return VNET_API_ERROR_VALUE_EXIST; + + for (i = 0; i < vec_len (nm->addr_pool); i++) + { + a = nm->addr_pool + i; + if (out_addr->as_u32 != a->addr.as_u32) + continue; + switch (p) + { +#define _(N, j, n, s) \ + case SNAT_PROTOCOL_##N: \ + if (clib_bitmap_get_no_check (a->busy_##n##_port_bitmap, \ + out_port)) \ + return VNET_API_ERROR_INVALID_VALUE; \ + clib_bitmap_set_no_check (a->busy_##n##_port_bitmap, \ + out_port, 1); \ + if (out_port > 1024) \ + a->busy_##n##_ports++; \ + break; + foreach_snat_protocol +#undef _ + default: + memset (&addr, 0, sizeof (addr)); + addr.ip4.as_u32 = out_addr->as_u32; + if (nat64_db_bib_entry_find + (&nm->db, &addr, 0, proto, fib_index, 0)) + return VNET_API_ERROR_INVALID_VALUE; + } + break; + } + bibe = + nat64_db_bib_entry_create (&nm->db, in_addr, out_addr, + clib_host_to_net_u16 (in_port), + clib_host_to_net_u16 (out_port), fib_index, + proto, 1); + if (!bibe) + return VNET_API_ERROR_UNSPECIFIED; + } + else + { + if (!bibe) + return VNET_API_ERROR_NO_SUCH_ENTRY; + + nat64_free_out_addr_and_port (out_addr, out_port, p); + nat64_db_bib_entry_free (&nm->db, bibe); + } + + return 0; +} + +int +nat64_set_udp_timeout (u32 timeout) +{ + nat64_main_t *nm = &nat64_main; + + if (timeout == 0) + nm->udp_timeout = SNAT_UDP_TIMEOUT; + else if (timeout < SNAT_UDP_TIMEOUT_MIN) + return VNET_API_ERROR_INVALID_VALUE; + else + nm->udp_timeout = timeout; + + return 0; +} + +u32 +nat64_get_udp_timeout (void) +{ + nat64_main_t *nm = &nat64_main; + + return nm->udp_timeout; +} + +int +nat64_set_icmp_timeout (u32 timeout) +{ + nat64_main_t *nm = &nat64_main; + + if (timeout == 0) + nm->icmp_timeout = SNAT_ICMP_TIMEOUT; + else + nm->icmp_timeout = timeout; + + return 0; +} + +u32 +nat64_get_icmp_timeout (void) +{ + nat64_main_t *nm = &nat64_main; + + return nm->icmp_timeout; +} + +int +nat64_set_tcp_timeouts (u32 trans, u32 est, u32 incoming_syn) +{ + nat64_main_t *nm = &nat64_main; + + if (trans == 0) + nm->tcp_trans_timeout = SNAT_TCP_TRANSITORY_TIMEOUT; + else + nm->tcp_trans_timeout = trans; + + if (est == 0) + nm->tcp_est_timeout = SNAT_TCP_ESTABLISHED_TIMEOUT; + else + nm->tcp_est_timeout = est; + + if (incoming_syn == 0) + 
nm->tcp_incoming_syn_timeout = SNAT_TCP_INCOMING_SYN; + else + nm->tcp_incoming_syn_timeout = incoming_syn; + + return 0; +} + +u32 +nat64_get_tcp_trans_timeout (void) +{ + nat64_main_t *nm = &nat64_main; + + return nm->tcp_trans_timeout; +} + +u32 +nat64_get_tcp_est_timeout (void) +{ + nat64_main_t *nm = &nat64_main; + + return nm->tcp_est_timeout; +} + +u32 +nat64_get_tcp_incoming_syn_timeout (void) +{ + nat64_main_t *nm = &nat64_main; + + return nm->tcp_incoming_syn_timeout; +} + +void +nat64_session_reset_timeout (nat64_db_st_entry_t * ste, vlib_main_t * vm) +{ + nat64_main_t *nm = &nat64_main; + u32 now = (u32) vlib_time_now (vm); + + switch (ip_proto_to_snat_proto (ste->proto)) + { + case SNAT_PROTOCOL_ICMP: + ste->expire = now + nm->icmp_timeout; + return; + case SNAT_PROTOCOL_TCP: + { + switch (ste->tcp_state) + { + case NAT64_TCP_STATE_V4_INIT: + case NAT64_TCP_STATE_V6_INIT: + case NAT64_TCP_STATE_V4_FIN_RCV: + case NAT64_TCP_STATE_V6_FIN_RCV: + case NAT64_TCP_STATE_V6_FIN_V4_FIN_RCV: + case NAT64_TCP_STATE_TRANS: + ste->expire = now + nm->tcp_trans_timeout; + return; + case NAT64_TCP_STATE_ESTABLISHED: + ste->expire = now + nm->tcp_est_timeout; + return; + default: + return; + } + } + case SNAT_PROTOCOL_UDP: + ste->expire = now + nm->udp_timeout; + return; + default: + ste->expire = now + nm->udp_timeout; + return; + } +} + +void +nat64_tcp_session_set_state (nat64_db_st_entry_t * ste, tcp_header_t * tcp, + u8 is_ip6) +{ + switch (ste->tcp_state) + { + case NAT64_TCP_STATE_CLOSED: + { + if (tcp->flags & TCP_FLAG_SYN) + { + if (is_ip6) + ste->tcp_state = NAT64_TCP_STATE_V6_INIT; + else + ste->tcp_state = NAT64_TCP_STATE_V4_INIT; + } + return; + } + case NAT64_TCP_STATE_V4_INIT: + { + if (is_ip6 && (tcp->flags & TCP_FLAG_SYN)) + ste->tcp_state = NAT64_TCP_STATE_ESTABLISHED; + return; + } + case NAT64_TCP_STATE_V6_INIT: + { + if (!is_ip6 && (tcp->flags & TCP_FLAG_SYN)) + ste->tcp_state = NAT64_TCP_STATE_ESTABLISHED; + return; + } + case NAT64_TCP_STATE_ESTABLISHED: + { + if (tcp->flags & TCP_FLAG_FIN) + { + if (is_ip6) + ste->tcp_state = NAT64_TCP_STATE_V6_FIN_RCV; + else + ste->tcp_state = NAT64_TCP_STATE_V4_FIN_RCV; + } + else if (tcp->flags & TCP_FLAG_RST) + { + ste->tcp_state = NAT64_TCP_STATE_TRANS; + } + return; + } + case NAT64_TCP_STATE_V4_FIN_RCV: + { + if (is_ip6 && (tcp->flags & TCP_FLAG_FIN)) + ste->tcp_state = NAT64_TCP_STATE_V6_FIN_V4_FIN_RCV; + return; + } + case NAT64_TCP_STATE_V6_FIN_RCV: + { + if (!is_ip6 && (tcp->flags & TCP_FLAG_FIN)) + ste->tcp_state = NAT64_TCP_STATE_V6_FIN_V4_FIN_RCV; + return; + } + case NAT64_TCP_STATE_TRANS: + { + if (!(tcp->flags & TCP_FLAG_RST)) + ste->tcp_state = NAT64_TCP_STATE_ESTABLISHED; + return; + } + default: + return; + } +} + +int +nat64_add_del_prefix (ip6_address_t * prefix, u8 plen, u32 vrf_id, u8 is_add) +{ + nat64_main_t *nm = &nat64_main; + nat64_prefix_t *p = 0; + int i; + + /* Verify prefix length */ + if (plen != 32 && plen != 40 && plen != 48 && plen != 56 && plen != 64 + && plen != 96) + return VNET_API_ERROR_INVALID_VALUE; + + /* Check if the tenant already has a prefix */ + for (i = 0; i < vec_len (nm->pref64); i++) + { + if (nm->pref64[i].vrf_id == vrf_id) + { + p = nm->pref64 + i; + break; + } + } + + if (is_add) + { + if (!p) + { + vec_add2 (nm->pref64, p, 1); + p->fib_index = + fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP6, vrf_id, + FIB_SOURCE_PLUGIN_HI); + p->vrf_id = vrf_id; + } + + p->prefix.as_u64[0] = prefix->as_u64[0]; + p->prefix.as_u64[1] = prefix->as_u64[1]; + p->plen = plen; + } + else + { + if (!p)
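/* NB: nat64_tcp_session_set_state() above implements a simplified RFC 6146 state machine: CLOSED -SYN(v6)-> V6_INIT -SYN(v4)-> ESTABLISHED; CLOSED -SYN(v4)-> V4_INIT -SYN(v6)-> ESTABLISHED; ESTABLISHED -FIN(v6)-> V6_FIN_RCV -FIN(v4)-> V6_FIN_V4_FIN_RCV; ESTABLISHED -FIN(v4)-> V4_FIN_RCV -FIN(v6)-> V6_FIN_V4_FIN_RCV; ESTABLISHED -RST-> TRANS -(any non-RST segment)-> ESTABLISHED. nat64_session_reset_timeout() then gives ESTABLISHED the long timeout and every other state the transitory one. */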
+ return VNET_API_ERROR_NO_SUCH_ENTRY; + + vec_del1 (nm->pref64, i); + } + + return 0; +} + +void +nat64_prefix_walk (nat64_prefix_walk_fn_t fn, void *ctx) +{ + nat64_main_t *nm = &nat64_main; + nat64_prefix_t *p = 0; + + /* *INDENT-OFF* */ + vec_foreach (p, nm->pref64) + { + if (fn (p, ctx)) + break; + }; + /* *INDENT-ON* */ +} + +void +nat64_compose_ip6 (ip6_address_t * ip6, ip4_address_t * ip4, u32 fib_index) +{ + nat64_main_t *nm = &nat64_main; + nat64_prefix_t *p, *gp = 0, *prefix = 0; + + /* *INDENT-OFF* */ + vec_foreach (p, nm->pref64) + { + if (p->fib_index == fib_index) + { + prefix = p; + break; + } + + if (p->fib_index == 0) + gp = p; + }; + /* *INDENT-ON* */ + + if (!prefix) + prefix = gp; + + if (prefix) + { + memset (ip6, 0, 16); + /* p is the loop cursor and may point past the vector here, so use + prefix; plen is in bits and is always a multiple of 8. */ + memcpy (ip6, &prefix->prefix, prefix->plen / 8); + /* Per RFC 6052 octet 8 (the reserved 'u' octet) stays zero, which is + why the byte indices below skip from 7 to 9; e.g. 192.0.2.1 behind + the well-known 64:ff9b::/96 prefix composes to 64:ff9b::c000:201. */ + switch (prefix->plen) + { + case 32: + ip6->as_u32[1] = ip4->as_u32; + break; + case 40: + ip6->as_u8[5] = ip4->as_u8[0]; + ip6->as_u8[6] = ip4->as_u8[1]; + ip6->as_u8[7] = ip4->as_u8[2]; + ip6->as_u8[9] = ip4->as_u8[3]; + break; + case 48: + ip6->as_u8[6] = ip4->as_u8[0]; + ip6->as_u8[7] = ip4->as_u8[1]; + ip6->as_u8[9] = ip4->as_u8[2]; + ip6->as_u8[10] = ip4->as_u8[3]; + break; + case 56: + ip6->as_u8[7] = ip4->as_u8[0]; + ip6->as_u8[9] = ip4->as_u8[1]; + ip6->as_u8[10] = ip4->as_u8[2]; + ip6->as_u8[11] = ip4->as_u8[3]; + break; + case 64: + ip6->as_u8[9] = ip4->as_u8[0]; + ip6->as_u8[10] = ip4->as_u8[1]; + ip6->as_u8[11] = ip4->as_u8[2]; + ip6->as_u8[12] = ip4->as_u8[3]; + break; + case 96: + ip6->as_u32[3] = ip4->as_u32; + break; + default: + clib_warning ("invalid prefix length"); + break; + } + } + else + { + memcpy (ip6, well_known_prefix, 16); + ip6->as_u32[3] = ip4->as_u32; + } +} + +void +nat64_extract_ip4 (ip6_address_t * ip6, ip4_address_t * ip4, u32 fib_index) +{ + nat64_main_t *nm = &nat64_main; + nat64_prefix_t *p, *gp = 0; + u8 plen = 0; + + /* *INDENT-OFF* */ + vec_foreach (p, nm->pref64) + { + if (p->fib_index == fib_index) + { + plen = p->plen; + break; + } + + if (p->vrf_id == 0) + gp = p; + }; + /* *INDENT-ON* */ + + if (!plen) + { + if (gp) + plen = gp->plen; + else + plen = 96; + } + + switch (plen) + { + case 32: + ip4->as_u32 = ip6->as_u32[1]; + break; + case 40: + ip4->as_u8[0] = ip6->as_u8[5]; + ip4->as_u8[1] = ip6->as_u8[6]; + ip4->as_u8[2] = ip6->as_u8[7]; + ip4->as_u8[3] = ip6->as_u8[9]; + break; + case 48: + ip4->as_u8[0] = ip6->as_u8[6]; + ip4->as_u8[1] = ip6->as_u8[7]; + ip4->as_u8[2] = ip6->as_u8[9]; + ip4->as_u8[3] = ip6->as_u8[10]; + break; + case 56: + ip4->as_u8[0] = ip6->as_u8[7]; + ip4->as_u8[1] = ip6->as_u8[9]; + ip4->as_u8[2] = ip6->as_u8[10]; + ip4->as_u8[3] = ip6->as_u8[11]; + break; + case 64: + ip4->as_u8[0] = ip6->as_u8[9]; + ip4->as_u8[1] = ip6->as_u8[10]; + ip4->as_u8[2] = ip6->as_u8[11]; + ip4->as_u8[3] = ip6->as_u8[12]; + break; + case 96: + ip4->as_u32 = ip6->as_u32[3]; + break; + default: + clib_warning ("invalid prefix length"); + break; + } +} + +/** + * @brief The 'nat64-expire-walk' process's main loop. + * + * Check expire time for NAT64 sessions.
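+ * The process below sleeps for up to 10 seconds at a time and then frees + * every session-table entry whose expiry timestamp has passed + * (nat64_db_st_free_expired()).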
+ */ +static uword +nat64_expire_walk_fn (vlib_main_t * vm, vlib_node_runtime_t * rt, + vlib_frame_t * f) +{ + nat64_main_t *nm = &nat64_main; + + while (!nm->is_disabled) + { + vlib_process_wait_for_event_or_clock (vm, 10.0); + vlib_process_get_events (vm, NULL); + u32 now = (u32) vlib_time_now (vm); + + nat64_db_st_free_expired (&nm->db, now); + } + + return 0; +} + +static vlib_node_registration_t nat64_expire_walk_node; + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (nat64_expire_walk_node, static) = { + .function = nat64_expire_walk_fn, + .type = VLIB_NODE_TYPE_PROCESS, + .name = "nat64-expire-walk", +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/nat/nat64.h b/src/plugins/nat/nat64.h new file mode 100644 index 00000000..68224cab --- /dev/null +++ b/src/plugins/nat/nat64.h @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @file + * @brief NAT64 global declarations + */ +#ifndef __included_nat64_h__ +#define __included_nat64_h__ + +#include <nat/nat.h> +#include <nat/nat64_db.h> + +#define foreach_nat64_tcp_ses_state \ + _(0, CLOSED, "closed") \ + _(1, V4_INIT, "v4-init") \ + _(2, V6_INIT, "v6-init") \ + _(3, ESTABLISHED, "established") \ + _(4, V4_FIN_RCV, "v4-fin-rcv") \ + _(5, V6_FIN_RCV, "v6-fin-rcv") \ + _(6, V6_FIN_V4_FIN_RCV, "v6-fin-v4-fin-rcv") \ + _(7, TRANS, "trans") + +typedef enum +{ +#define _(v, N, s) NAT64_TCP_STATE_##N = v, + foreach_nat64_tcp_ses_state +#undef _ +} nat64_tcp_ses_state_t; + +typedef struct +{ + ip6_address_t prefix; + u8 plen; + u32 vrf_id; + u32 fib_index; +} nat64_prefix_t; + +typedef struct +{ + /** Interface pool */ + snat_interface_t *interfaces; + + /** Address pool vector */ + snat_address_t *addr_pool; + + /** Pref64 vector */ + nat64_prefix_t *pref64; + + /** BIB and session DB */ + nat64_db_t db; + + /* values of various timeouts */ + u32 udp_timeout; + u32 icmp_timeout; + u32 tcp_trans_timeout; + u32 tcp_est_timeout; + u32 tcp_incoming_syn_timeout; + + u8 is_disabled; + + snat_main_t *sm; +} nat64_main_t; + +extern nat64_main_t nat64_main; +extern vlib_node_registration_t nat64_in2out_node; +extern vlib_node_registration_t nat64_out2in_node; + +/** + * @brief Add/delete address to NAT64 pool. + * + * @param addr IPv4 address. + * @param vrf_id VRF id of tenant, ~0 means independent of VRF. + * @param is_add 1 if add, 0 if delete. + * + * @returns 0 on success, non-zero value otherwise. + */ +int nat64_add_del_pool_addr (ip4_address_t * addr, u32 vrf_id, u8 is_add); + +/** + * @brief Callback function invoked for each address when walking the NAT64 + * pool; a non-zero return value stops the walk. + */ +typedef int (*nat64_pool_addr_walk_fn_t) (snat_address_t * addr, void *ctx); + +/** + * @brief Walk NAT64 pool. + * + * @param fn The function to invoke on each entry visited. + * @param ctx A context passed in the visit function.
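+ * + * A minimal usage sketch (hypothetical callback; returning non-zero stops + * the walk early): + * + * static int count_addr_cb (snat_address_t * addr, void *ctx) + * { + * u32 *count = ctx; + * (*count)++; + * return 0; // keep walking + * } + * + * u32 n = 0; + * nat64_pool_addr_walk (count_addr_cb, &n);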
+ */ +void nat64_pool_addr_walk (nat64_pool_addr_walk_fn_t fn, void *ctx); + +/** + * @brief Enable/disable NAT64 feature on the interface. + * + * @param sw_if_index Index of the interface. + * @param is_inside 1 if inside, 0 if outside. + * @param is_add 1 if add, 0 if delete. + * + * @returns 0 on success, non-zero value otherwise. + */ +int nat64_add_del_interface (u32 sw_if_index, u8 is_inside, u8 is_add); + +/** + * @brief Callback function invoked for each interface when walking interfaces + * with the NAT64 feature; a non-zero return value stops the walk. + */ +typedef int (*nat64_interface_walk_fn_t) (snat_interface_t * i, void *ctx); + +/** + * @brief Walk NAT64 interfaces. + * + * @param fn The function to invoke on each entry visited. + * @param ctx A context passed in the visit function. + */ +void nat64_interfaces_walk (nat64_interface_walk_fn_t fn, void *ctx); + +/** + * @brief Initialize NAT64. + * + * @param vm vlib main. + * + * @return error code. + */ +clib_error_t *nat64_init (vlib_main_t * vm); + +/** + * @brief Add/delete static NAT64 BIB entry. + * + * @param in_addr Inside IPv6 address. + * @param out_addr Outside IPv4 address. + * @param in_port Inside port number. + * @param out_port Outside port number. + * @param proto L4 protocol. + * @param vrf_id VRF id of tenant. + * @param is_add 1 if add, 0 if delete. + * + * @returns 0 on success, non-zero value otherwise. + */ +int nat64_add_del_static_bib_entry (ip6_address_t * in_addr, + ip4_address_t * out_addr, u16 in_port, + u16 out_port, u8 proto, u32 vrf_id, + u8 is_add); + +/** + * @brief Allocate IPv4 address and port pair from NAT64 pool. + * + * @param fib_index FIB index of tenant. + * @param proto L4 protocol. + * @param addr Allocated IPv4 address. + * @param port Allocated port number. + * + * @returns 0 on success, non-zero value otherwise. + */ +int nat64_alloc_out_addr_and_port (u32 fib_index, snat_protocol_t proto, + ip4_address_t * addr, u16 * port); + +/** + * @brief Free IPv4 address and port pair from NAT64 pool. + * + * @param addr IPv4 address to free. + * @param port Port number to free. + * @param proto L4 protocol. + */ +void nat64_free_out_addr_and_port (ip4_address_t * addr, u16 port, + snat_protocol_t proto); + +/** + * @brief Set UDP session timeout. + * + * @param timeout Timeout value in seconds (if 0 reset to default value 300sec). + * + * @returns 0 on success, non-zero value otherwise. + */ +int nat64_set_udp_timeout (u32 timeout); + +/** + * @brief Get UDP session timeout. + * + * @returns UDP session timeout in seconds. + */ +u32 nat64_get_udp_timeout (void); + +/** + * @brief Set ICMP session timeout. + * + * @param timeout Timeout value in seconds (if 0 reset to default value 60sec). + * + * @returns 0 on success, non-zero value otherwise. + */ +int nat64_set_icmp_timeout (u32 timeout); + +/** + * @brief Get ICMP session timeout. + * + * @returns ICMP session timeout in seconds. + */ +u32 nat64_get_icmp_timeout (void); + +/** + * @brief Set TCP session timeouts. + * + * @param trans Transitory timeout in seconds (if 0 reset to default value 240sec). + * @param est Established timeout in seconds (if 0 reset to default value 7440sec). + * @param incoming_syn Incoming SYN timeout in seconds (if 0 reset to default value 6sec). + * + * @returns 0 on success, non-zero value otherwise. + */ +int nat64_set_tcp_timeouts (u32 trans, u32 est, u32 incoming_syn); + +/** + * @brief Get TCP transitory timeout. + * + * @returns TCP transitory timeout in seconds.
+ */ +u32 nat64_get_tcp_trans_timeout (void); + +/** + * @brief Get TCP established timeout. + * + * @returns TCP established timeout in seconds. + */ +u32 nat64_get_tcp_est_timeout (void); + +/** + * @brief Get TCP incoming SYN timeout. + * + * @returns TCP incoming SYN timeout in seconds. + */ +u32 nat64_get_tcp_incoming_syn_timeout (void); + +/** + * @brief Reset NAT64 session timeout. + * + * @param ste Session table entry. + * @param vm VLIB main. + */ +void nat64_session_reset_timeout (nat64_db_st_entry_t * ste, + vlib_main_t * vm); + +/** + * @brief Set NAT64 TCP session state. + * + * @param ste Session table entry. + * @param tcp TCP header. + * @param is_ip6 1 if IPv6 packet, 0 if IPv4. + */ +void nat64_tcp_session_set_state (nat64_db_st_entry_t * ste, + tcp_header_t * tcp, u8 is_ip6); + +/** + * @brief Add/delete NAT64 prefix. + * + * @param prefix NAT64 prefix. + * @param plen Prefix length. + * @param vrf_id VRF id of tenant. + * @param is_add 1 if add, 0 if delete. + * + * @returns 0 on success, non-zero value otherwise. + */ +int nat64_add_del_prefix (ip6_address_t * prefix, u8 plen, u32 vrf_id, + u8 is_add); + +/** + * @brief Callback function invoked for each entry when walking NAT64 + * prefixes; a non-zero return value stops the walk. + */ +typedef int (*nat64_prefix_walk_fn_t) (nat64_prefix_t * pref64, void *ctx); + +/** + * @brief Walk NAT64 prefixes. + * + * @param fn The function to invoke on each entry visited. + * @param ctx A context passed in the visit function. + */ +void nat64_prefix_walk (nat64_prefix_walk_fn_t fn, void *ctx); + +/** + * Compose an IPv4-embedded IPv6 address. + * @param ip6 IPv4-embedded IPv6 address. + * @param ip4 IPv4 address. + * @param fib_index Tenant FIB index. + */ +void nat64_compose_ip6 (ip6_address_t * ip6, ip4_address_t * ip4, + u32 fib_index); + +/** + * Extract the IPv4 address from an IPv4-embedded IPv6 address. + * + * @param ip6 IPv4-embedded IPv6 address. + * @param ip4 IPv4 address. + * @param fib_index Tenant FIB index. + */ +void nat64_extract_ip4 (ip6_address_t * ip6, ip4_address_t * ip4, + u32 fib_index); + +#define u8_ptr_add(ptr, index) (((u8 *)ptr) + index) +#define u16_net_add(u, val) clib_host_to_net_u16(clib_net_to_host_u16(u) + (val)) + +#endif /* __included_nat64_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/nat/nat64_cli.c b/src/plugins/nat/nat64_cli.c new file mode 100644 index 00000000..f3645bbb --- /dev/null +++ b/src/plugins/nat/nat64_cli.c @@ -0,0 +1,984 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +/** + * @file + * @brief NAT64 CLI + */ + +#include <nat/nat64.h> +#include <nat/nat.h> +#include <vnet/fib/fib_table.h> + +static clib_error_t * +nat64_add_del_pool_addr_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + nat64_main_t *nm = &nat64_main; + unformat_input_t _line_input, *line_input = &_line_input; + ip4_address_t start_addr, end_addr, this_addr; + u32 start_host_order, end_host_order; + int i, count, rv; + u32 vrf_id = ~0; + u8 is_add = 1; + clib_error_t *error = 0; + + if (nm->is_disabled) + return clib_error_return (0, + "NAT64 disabled, multi thread not supported"); + + /* Get a line of input. */ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "%U - %U", + unformat_ip4_address, &start_addr, + unformat_ip4_address, &end_addr)) + ; + else if (unformat (line_input, "tenant-vrf %u", &vrf_id)) + ; + else if (unformat (line_input, "%U", unformat_ip4_address, &start_addr)) + end_addr = start_addr; + else if (unformat (line_input, "del")) + is_add = 0; + else + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, line_input); + goto done; + } + } + + /* Convert to host byte order so the range can be compared and iterated + arithmetically. */ + start_host_order = clib_host_to_net_u32 (start_addr.as_u32); + end_host_order = clib_host_to_net_u32 (end_addr.as_u32); + + if (end_host_order < start_host_order) + { + error = clib_error_return (0, "end address less than start address"); + goto done; + } + + count = (end_host_order - start_host_order) + 1; + this_addr = start_addr; + + for (i = 0; i < count; i++) + { + rv = nat64_add_del_pool_addr (&this_addr, vrf_id, is_add); + + switch (rv) + { + case VNET_API_ERROR_NO_SUCH_ENTRY: + error = + clib_error_return (0, "NAT64 pool address %U does not exist.", + format_ip4_address, &this_addr); + goto done; + case VNET_API_ERROR_VALUE_EXIST: + error = + clib_error_return (0, "NAT64 pool address %U already exists.", + format_ip4_address, &this_addr); + goto done; + default: + break; + + } + increment_v4_address (&this_addr); + } + +done: + unformat_free (line_input); + + return error; +} + +static int +nat64_cli_pool_walk (snat_address_t * ap, void *ctx) +{ + vlib_main_t *vm = ctx; + + if (ap->fib_index != ~0) + { + fib_table_t *fib; + fib = fib_table_get (ap->fib_index, FIB_PROTOCOL_IP6); + if (!fib) + return -1; + vlib_cli_output (vm, " %U tenant VRF: %u", format_ip4_address, + &ap->addr, fib->ft_table_id); + } + else + vlib_cli_output (vm, " %U", format_ip4_address, &ap->addr); + + return 0; +} + +static clib_error_t * +nat64_show_pool_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + nat64_main_t *nm = &nat64_main; + + if (nm->is_disabled) + return clib_error_return (0, + "NAT64 disabled, multi thread not supported"); + + vlib_cli_output (vm, "NAT64 pool:"); + nat64_pool_addr_walk (nat64_cli_pool_walk, vm); + + return 0; +} + +static clib_error_t * +nat64_interface_feature_command_fn (vlib_main_t * vm, + unformat_input_t * + input, vlib_cli_command_t * cmd) +{ + nat64_main_t *nm = &nat64_main; + unformat_input_t _line_input, *line_input = &_line_input; + vnet_main_t *vnm = vnet_get_main (); + clib_error_t *error = 0; + u32 sw_if_index; + u32 *inside_sw_if_indices = 0; + u32 *outside_sw_if_indices = 0; + u8 is_add = 1; + int i, rv; + + if (nm->is_disabled) + return clib_error_return (0, + "NAT64 disabled, multi thread not supported"); + + /* Get a line of input.
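Everything up to end-of-line is buffered into line_input by unformat_line_input; every exit path below then goes through the done label so unformat_free() always releases it.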
*/ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "in %U", unformat_vnet_sw_interface, + vnm, &sw_if_index)) + vec_add1 (inside_sw_if_indices, sw_if_index); + else if (unformat (line_input, "out %U", unformat_vnet_sw_interface, + vnm, &sw_if_index)) + vec_add1 (outside_sw_if_indices, sw_if_index); + else if (unformat (line_input, "del")) + is_add = 0; + else + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (vec_len (inside_sw_if_indices)) + { + for (i = 0; i < vec_len (inside_sw_if_indices); i++) + { + sw_if_index = inside_sw_if_indices[i]; + rv = nat64_add_del_interface (sw_if_index, 1, is_add); + switch (rv) + { + case VNET_API_ERROR_NO_SUCH_ENTRY: + error = + clib_error_return (0, "%U NAT64 feature not enabled.", + format_vnet_sw_interface_name, vnm, + vnet_get_sw_interface (vnm, sw_if_index)); + goto done; + case VNET_API_ERROR_VALUE_EXIST: + error = + clib_error_return (0, "%U NAT64 feature already enabled.", + format_vnet_sw_interface_name, vnm, + vnet_get_sw_interface (vnm, sw_if_index)); + goto done; + case VNET_API_ERROR_INVALID_VALUE: + case VNET_API_ERROR_INVALID_VALUE_2: + error = + clib_error_return (0, + "%U NAT64 feature enable/disable failed.", + format_vnet_sw_interface_name, vnm, + vnet_get_sw_interface (vnm, sw_if_index)); + goto done; + default: + break; + + } + } + } + + if (vec_len (outside_sw_if_indices)) + { + for (i = 0; i < vec_len (outside_sw_if_indices); i++) + { + sw_if_index = outside_sw_if_indices[i]; + rv = nat64_add_del_interface (sw_if_index, 0, is_add); + switch (rv) + { + case VNET_API_ERROR_NO_SUCH_ENTRY: + error = + clib_error_return (0, "%U NAT64 feature not enabled.", + format_vnet_sw_interface_name, vnm, + vnet_get_sw_interface (vnm, sw_if_index)); + goto done; + case VNET_API_ERROR_VALUE_EXIST: + error = + clib_error_return (0, "%U NAT64 feature already enabled.", + format_vnet_sw_interface_name, vnm, + vnet_get_sw_interface (vnm, sw_if_index)); + goto done; + case VNET_API_ERROR_INVALID_VALUE: + case VNET_API_ERROR_INVALID_VALUE_2: + error = + clib_error_return (0, + "%U NAT64 feature enable/disable failed.", + format_vnet_sw_interface_name, vnm, + vnet_get_sw_interface (vnm, sw_if_index)); + goto done; + default: + break; + + } + } + } + +done: + unformat_free (line_input); + vec_free (inside_sw_if_indices); + vec_free (outside_sw_if_indices); + + return error; +} + +static int +nat64_cli_interface_walk (snat_interface_t * i, void *ctx) +{ + vlib_main_t *vm = ctx; + vnet_main_t *vnm = vnet_get_main (); + + vlib_cli_output (vm, " %U %s", format_vnet_sw_interface_name, vnm, + vnet_get_sw_interface (vnm, i->sw_if_index), + i->is_inside ? 
"in" : "out"); + return 0; +} + +static clib_error_t * +nat64_show_interfaces_command_fn (vlib_main_t * vm, + unformat_input_t * + input, vlib_cli_command_t * cmd) +{ + nat64_main_t *nm = &nat64_main; + + if (nm->is_disabled) + return clib_error_return (0, + "NAT64 disabled, multi thread not supported"); + + vlib_cli_output (vm, "NAT64 interfaces:"); + nat64_interfaces_walk (nat64_cli_interface_walk, vm); + + return 0; +} + +static clib_error_t * +nat64_add_del_static_bib_command_fn (vlib_main_t * + vm, + unformat_input_t + * input, vlib_cli_command_t * cmd) +{ + nat64_main_t *nm = &nat64_main; + unformat_input_t _line_input, *line_input = &_line_input; + clib_error_t *error = 0; + u8 is_add = 1; + ip6_address_t in_addr; + ip4_address_t out_addr; + u32 in_port = 0; + u32 out_port = 0; + u32 vrf_id = 0, protocol; + snat_protocol_t proto = 0; + u8 p = 0; + int rv; + + if (nm->is_disabled) + return clib_error_return (0, + "NAT64 disabled, multi thread not supported"); + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "%U %u", unformat_ip6_address, + &in_addr, &in_port)) + ; + else if (unformat (line_input, "%U %u", unformat_ip4_address, + &out_addr, &out_port)) + ; + else if (unformat (line_input, "vrf %u", &vrf_id)) + ; + else if (unformat (line_input, "%U", unformat_snat_protocol, &proto)) + ; + else + if (unformat + (line_input, "%U %U %u", unformat_ip6_address, &in_addr, + unformat_ip4_address, &out_addr, &protocol)) + p = (u8) protocol; + else if (unformat (line_input, "del")) + is_add = 0; + else + { + error = clib_error_return (0, "unknown input: '%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (!p) + { + if (!in_port) + { + error = + clib_error_return (0, "inside port and address must be set"); + goto done; + } + + if (!out_port) + { + error = + clib_error_return (0, "outside port and address must be set"); + goto done; + } + + p = snat_proto_to_ip_proto (proto); + } + + rv = + nat64_add_del_static_bib_entry (&in_addr, &out_addr, (u16) in_port, + (u16) out_port, p, vrf_id, is_add); + + switch (rv) + { + case VNET_API_ERROR_NO_SUCH_ENTRY: + error = clib_error_return (0, "NAT64 BIB entry not exist."); + goto done; + case VNET_API_ERROR_VALUE_EXIST: + error = clib_error_return (0, "NAT64 BIB entry exist."); + goto done; + case VNET_API_ERROR_UNSPECIFIED: + error = clib_error_return (0, "Crerate NAT64 BIB entry failed."); + goto done; + case VNET_API_ERROR_INVALID_VALUE: + error = + clib_error_return (0, "Outside addres %U and port %u already in use.", + format_ip4_address, &out_addr, out_port); + goto done; + default: + break; + } + +done: + unformat_free (line_input); + + return error; +} + +static int +nat64_cli_bib_walk (nat64_db_bib_entry_t * bibe, void *ctx) +{ + vlib_main_t *vm = ctx; + fib_table_t *fib; + + fib = fib_table_get (bibe->fib_index, FIB_PROTOCOL_IP6); + if (!fib) + return -1; + + switch (bibe->proto) + { + case IP_PROTOCOL_ICMP: + case IP_PROTOCOL_TCP: + case IP_PROTOCOL_UDP: + vlib_cli_output (vm, " %U %u %U %u protocol %U vrf %u %s %u sessions", + format_ip6_address, &bibe->in_addr, + clib_net_to_host_u16 (bibe->in_port), + format_ip4_address, &bibe->out_addr, + clib_net_to_host_u16 (bibe->out_port), + format_snat_protocol, + ip_proto_to_snat_proto (bibe->proto), fib->ft_table_id, + bibe->is_static ? 
"static" : "dynamic", bibe->ses_num); + break; + default: + vlib_cli_output (vm, " %U %U protocol %u vrf %u %s %u sessions", + format_ip6_address, &bibe->in_addr, + format_ip4_address, &bibe->out_addr, + bibe->proto, fib->ft_table_id, + bibe->is_static ? "static" : "dynamic", bibe->ses_num); + } + return 0; +} + +static clib_error_t * +nat64_show_bib_command_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + nat64_main_t *nm = &nat64_main; + unformat_input_t _line_input, *line_input = &_line_input; + clib_error_t *error = 0; + u32 proto = ~0; + u8 p = 255; + + if (nm->is_disabled) + return clib_error_return (0, + "NAT64 disabled, multi thread not supported"); + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + if (unformat (line_input, "%U", unformat_snat_protocol, &proto)) + p = snat_proto_to_ip_proto (proto); + else if (unformat (line_input, "unknown")) + p = 0; + else if (unformat (line_input, "all")) + ; + else + { + error = clib_error_return (0, "unknown input: '%U'", + format_unformat_error, line_input); + goto done; + } + + if (p == 255) + vlib_cli_output (vm, "NAT64 BIB entries:"); + else + vlib_cli_output (vm, "NAT64 %U BIB entries:", format_snat_protocol, + proto); + nat64_db_bib_walk (&nm->db, p, nat64_cli_bib_walk, vm); + +done: + unformat_free (line_input); + + return error; +} + +static clib_error_t * +nat64_set_timeouts_command_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + nat64_main_t *nm = &nat64_main; + unformat_input_t _line_input, *line_input = &_line_input; + clib_error_t *error = 0; + u32 timeout, tcp_trans, tcp_est, tcp_incoming_syn; + + tcp_trans = nat64_get_tcp_trans_timeout (); + tcp_est = nat64_get_tcp_est_timeout (); + tcp_incoming_syn = nat64_get_tcp_incoming_syn_timeout (); + + if (nm->is_disabled) + return clib_error_return (0, + "NAT64 disabled, multi thread not supported"); + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "udp %u", &timeout)) + { + if (nat64_set_udp_timeout (timeout)) + { + error = clib_error_return (0, "Invalid UDP timeout value"); + goto done; + } + } + else if (unformat (line_input, "icmp %u", &timeout)) + { + if (nat64_set_icmp_timeout (timeout)) + { + error = clib_error_return (0, "Invalid ICMP timeout value"); + goto done; + } + } + else if (unformat (line_input, "tcp-trans %u", &tcp_trans)) + { + if (nat64_set_tcp_timeouts (tcp_trans, tcp_est, tcp_incoming_syn)) + { + error = + clib_error_return (0, + "Invalid TCP transitory timeouts value"); + goto done; + } + } + else if (unformat (line_input, "tcp-est %u", &tcp_est)) + { + if (nat64_set_tcp_timeouts (tcp_trans, tcp_est, tcp_incoming_syn)) + { + error = + clib_error_return (0, + "Invalid TCP established timeouts value"); + goto done; + } + } + else + if (unformat (line_input, "tcp-incoming-syn %u", &tcp_incoming_syn)) + { + if (nat64_set_tcp_timeouts (tcp_trans, tcp_est, tcp_incoming_syn)) + { + error = + clib_error_return (0, + "Invalid TCP incoming SYN timeouts value"); + goto done; + } + } + else if (unformat (line_input, "reset")) + { + nat64_set_udp_timeout (0); + nat64_set_icmp_timeout (0); + nat64_set_tcp_timeouts (0, 0, 0); + } + else + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, line_input); + goto done; + } + } + +done: + unformat_free (line_input); + + return error; +} + +static clib_error_t * 
+nat64_show_timeouts_command_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + nat64_main_t *nm = &nat64_main; + + if (nm->is_disabled) + return clib_error_return (0, + "NAT64 disabled, multi thread not supported"); + + vlib_cli_output (vm, "NAT64 session timeouts:"); + vlib_cli_output (vm, " UDP %usec", nat64_get_udp_timeout ()); + vlib_cli_output (vm, " ICMP %usec", nat64_get_icmp_timeout ()); + vlib_cli_output (vm, " TCP transitory %usec", + nat64_get_tcp_trans_timeout ()); + vlib_cli_output (vm, " TCP established %usec", + nat64_get_tcp_est_timeout ()); + vlib_cli_output (vm, " TCP incoming SYN %usec", + nat64_get_tcp_incoming_syn_timeout ()); + + return 0; +} + +static int +nat64_cli_st_walk (nat64_db_st_entry_t * ste, void *ctx) +{ + vlib_main_t *vm = ctx; + nat64_main_t *nm = &nat64_main; + nat64_db_bib_entry_t *bibe; + fib_table_t *fib; + + bibe = nat64_db_bib_entry_by_index (&nm->db, ste->proto, ste->bibe_index); + if (!bibe) + return -1; + + fib = fib_table_get (bibe->fib_index, FIB_PROTOCOL_IP6); + if (!fib) + return -1; + + u32 vrf_id = fib->ft_table_id; + + if (ste->proto == IP_PROTOCOL_ICMP) + vlib_cli_output (vm, " %U %U %u %U %U %u protocol %U vrf %u", + format_ip6_address, &bibe->in_addr, + format_ip6_address, &ste->in_r_addr, + clib_net_to_host_u16 (bibe->in_port), + format_ip4_address, &bibe->out_addr, + format_ip4_address, &ste->out_r_addr, + clib_net_to_host_u16 (bibe->out_port), + format_snat_protocol, + ip_proto_to_snat_proto (bibe->proto), vrf_id); + else if (ste->proto == IP_PROTOCOL_TCP || ste->proto == IP_PROTOCOL_UDP) + vlib_cli_output (vm, " %U %u %U %u %U %u %U %u protocol %U vrf %u", + format_ip6_address, &bibe->in_addr, + clib_net_to_host_u16 (bibe->in_port), + format_ip6_address, &ste->in_r_addr, + clib_net_to_host_u16 (ste->r_port), + format_ip4_address, &bibe->out_addr, + clib_net_to_host_u16 (bibe->out_port), + format_ip4_address, &ste->out_r_addr, + clib_net_to_host_u16 (ste->r_port), + format_snat_protocol, + ip_proto_to_snat_proto (bibe->proto), vrf_id); + else + vlib_cli_output (vm, " %U %U %U %U protocol %u vrf %u", + format_ip6_address, &bibe->in_addr, + format_ip6_address, &ste->in_r_addr, + format_ip4_address, &bibe->out_addr, + format_ip4_address, &ste->out_r_addr, + bibe->proto, vrf_id); + + return 0; +} + +static clib_error_t * +nat64_show_st_command_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + nat64_main_t *nm = &nat64_main; + unformat_input_t _line_input, *line_input = &_line_input; + clib_error_t *error = 0; + u32 proto = ~0; + u8 p = 255; + + if (nm->is_disabled) + return clib_error_return (0, + "NAT64 disabled, multi thread not supported"); + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + if (unformat (line_input, "%U", unformat_snat_protocol, &proto)) + p = snat_proto_to_ip_proto (proto); + else if (unformat (line_input, "unknown")) + p = 0; + else if (unformat (line_input, "all")) + ; + else + { + error = clib_error_return (0, "unknown input: '%U'", + format_unformat_error, line_input); + goto done; + } + + if (p == 255) + vlib_cli_output (vm, "NAT64 sessions:"); + else + vlib_cli_output (vm, "NAT64 %U sessions:", format_snat_protocol, proto); + nat64_db_st_walk (&nm->db, p, nat64_cli_st_walk, vm); + +done: + unformat_free (line_input); + + return error; +} + +static clib_error_t * +nat64_add_del_prefix_command_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + nat64_main_t *nm = &nat64_main; + clib_error_t
*error = 0; + unformat_input_t _line_input, *line_input = &_line_input; + u8 is_add = 1; + u32 vrf_id = 0; + ip6_address_t prefix; + u32 plen = 0; + int rv; + + if (nm->is_disabled) + return clib_error_return (0, + "NAT64 disabled, multi thread not supported"); + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat + (line_input, "%U/%u", unformat_ip6_address, &prefix, &plen)) + ; + else if (unformat (line_input, "tenant-vrf %u", &vrf_id)) + ; + else if (unformat (line_input, "del")) + is_add = 0; + else + { + error = clib_error_return (0, "unknown input: '%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (!plen) + { + error = clib_error_return (0, "NAT64 prefix must be set."); + goto done; + } + + rv = nat64_add_del_prefix (&prefix, (u8) plen, vrf_id, is_add); + + switch (rv) + { + case VNET_API_ERROR_NO_SUCH_ENTRY: + error = clib_error_return (0, "NAT64 prefix does not exist."); + goto done; + case VNET_API_ERROR_INVALID_VALUE: + error = clib_error_return (0, "Invalid prefix length."); + goto done; + default: + break; + } + +done: + unformat_free (line_input); + + return error; +} + +static int +nat64_cli_prefix_walk (nat64_prefix_t * p, void *ctx) +{ + vlib_main_t *vm = ctx; + + vlib_cli_output (vm, " %U/%u tenant-vrf %u", + format_ip6_address, &p->prefix, p->plen, p->vrf_id); + + return 0; +} + +static clib_error_t * +nat64_show_prefix_command_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + nat64_main_t *nm = &nat64_main; + + if (nm->is_disabled) + return clib_error_return (0, + "NAT64 disabled, multi thread not supported"); + + vlib_cli_output (vm, "NAT64 prefix:"); + nat64_prefix_walk (nat64_cli_prefix_walk, vm); + + return 0; +} + +/* *INDENT-OFF* */ + +/*? + * @cliexpar + * @cliexstart{nat64 add pool address} + * Add/delete NAT64 pool address. + * To add a single NAT64 pool address use: + * vpp# nat64 add pool address 10.1.1.10 + * To add a NAT64 pool address range use: + * vpp# nat64 add pool address 10.1.1.2 - 10.1.1.5 + * To add a NAT64 pool address for a specific tenant use: + * vpp# nat64 add pool address 10.1.1.100 tenant-vrf 100 + * @cliexend +?*/ +VLIB_CLI_COMMAND (nat64_add_pool_address_command, static) = { + .path = "nat64 add pool address", + .short_help = "nat64 add pool address <ip4-range-start> [- <ip4-range-end>] " + "[tenant-vrf <vrf-id>] [del]", + .function = nat64_add_del_pool_addr_command_fn, +}; + +/*? + * @cliexpar + * @cliexstart{show nat64 pool} + * Show NAT64 pool. + * vpp# show nat64 pool + * NAT64 pool: + * 10.1.1.3 tenant VRF: 0 + * 10.1.1.10 tenant VRF: 10 + * @cliexend +?*/ +VLIB_CLI_COMMAND (show_nat64_pool_command, static) = { + .path = "show nat64 pool", + .short_help = "show nat64 pool", + .function = nat64_show_pool_command_fn, +}; + +/*? + * @cliexpar + * @cliexstart{set interface nat64} + * Enable/disable NAT64 feature on the interface. + * To enable NAT64 feature with local (IPv6) network interface + * GigabitEthernet0/8/0 and external (IPv4) network interface + * GigabitEthernet0/a/0 use: + * vpp# set interface nat64 in GigabitEthernet0/8/0 out GigabitEthernet0/a/0 + * @cliexend +?*/ +VLIB_CLI_COMMAND (set_interface_nat64_command, static) = { + .path = "set interface nat64", + .short_help = "set interface nat64 in|out <intfc> [del]", + .function = nat64_interface_feature_command_fn, +}; + +/*? + * @cliexpar + * @cliexstart{show nat64 interfaces} + * Show interfaces with NAT64 feature.
+ * To show interfaces with NAT64 feature use: + * vpp# show nat64 interfaces + * NAT64 interfaces: + * GigabitEthernet0/8/0 in + * GigabitEthernet0/a/0 out + * @cliexend +?*/ +VLIB_CLI_COMMAND (show_nat64_interfaces_command, static) = { + .path = "show nat64 interfaces", + .short_help = "show nat64 interfaces", + .function = nat64_show_interfaces_command_fn, +}; + +/*? + * @cliexpar + * @cliexstart{nat64 add static bib} + * Add/delete NAT64 static BIB entry. + * To create a NAT64 static BIB entry use: + * vpp# nat64 add static bib 2001:db8:c000:221:: 1234 10.1.1.3 5678 tcp + * vpp# nat64 add static bib 2001:db8:c000:221:: 1234 10.1.1.3 5678 udp vrf 10 + * @cliexend +?*/ +VLIB_CLI_COMMAND (nat64_add_del_static_bib_command, static) = { + .path = "nat64 add static bib", + .short_help = "nat64 add static bib <ip6-addr> <port> <ip4-addr> <port> " + "tcp|udp|icmp [vrf <table-id>] [del]", + .function = nat64_add_del_static_bib_command_fn, +}; + +/*? + * @cliexpar + * @cliexstart{show nat64 bib} + * Show NAT64 BIB entries. + * To show NAT64 TCP BIB entries use: + * vpp# show nat64 bib tcp + * NAT64 tcp BIB: + * fd01:1::2 6303 10.0.0.3 62303 tcp vrf 0 dynamic 1 sessions + * 2001:db8:c000:221:: 1234 10.1.1.3 5678 tcp vrf 0 static 2 sessions + * To show NAT64 UDP BIB entries use: + * vpp# show nat64 bib udp + * NAT64 udp BIB: + * fd01:1::2 6304 10.0.0.3 10546 udp vrf 0 dynamic 10 sessions + * 2001:db8:c000:221:: 1234 10.1.1.3 5678 udp vrf 10 static 0 sessions + * To show NAT64 ICMP BIB entries use: + * vpp# show nat64 bib icmp + * NAT64 icmp BIB: + * fd01:1::2 6305 10.0.0.3 63209 icmp vrf 10 dynamic 1 sessions + * @cliexend +?*/ +VLIB_CLI_COMMAND (show_nat64_bib_command, static) = { + .path = "show nat64 bib", + .short_help = "show nat64 bib all|tcp|udp|icmp|unknown", + .function = nat64_show_bib_command_fn, +}; + +/*? + * @cliexpar + * @cliexstart{set nat64 timeouts} + * Set NAT64 session timeouts (in seconds). + * To set NAT64 session timeouts use: + * vpp# set nat64 timeouts udp 200 icmp 30 tcp-trans 250 tcp-est 7450 + * To reset NAT64 session timeouts to default values use: + * vpp# set nat64 timeouts reset + * @cliexend +?*/ +VLIB_CLI_COMMAND (set_nat64_timeouts_command, static) = { + .path = "set nat64 timeouts", + .short_help = "set nat64 timeouts udp <sec> icmp <sec> tcp-trans <sec> " + "tcp-est <sec> tcp-incoming-syn <sec> | reset", + .function = nat64_set_timeouts_command_fn, +}; + +/*? + * @cliexpar + * @cliexstart{show nat64 timeouts} + * Show NAT64 session timeouts: + * vpp# show nat64 timeouts + * NAT64 session timeouts: + * UDP 300sec + * ICMP 60sec + * TCP transitory 240sec + * TCP established 7440sec + * TCP incoming SYN 6sec + * @cliexend +?*/ +VLIB_CLI_COMMAND (show_nat64_timeouts_command, static) = { + .path = "show nat64 timeouts", + .short_help = "show nat64 timeouts", + .function = nat64_show_timeouts_command_fn, +}; + +/*? + * @cliexpar + * @cliexstart{show nat64 session table} + * Show NAT64 session table.
+ * To show NAT64 TCP session table use: + * vpp# show nat64 session table tcp + * NAT64 tcp session table: + * fd01:1::2 6303 64:ff9b::ac10:202 20 10.0.0.3 62303 172.16.2.2 20 tcp vrf 0 + * fd01:3::2 6303 64:ff9b::ac10:202 20 10.0.10.3 21300 172.16.2.2 20 tcp vrf 10 + * To show NAT64 UDP session table use: + * vpp# show nat64 session table udp + * NAT64 udp session table: + * fd01:1::2 6304 64:ff9b::ac10:202 20 10.0.0.3 10546 172.16.2.2 20 udp vrf 0 + * fd01:3::2 6304 64:ff9b::ac10:202 20 10.0.10.3 58627 172.16.2.2 20 udp vrf 10 + * fd01:1::2 1235 64:ff9b::a00:3 4023 10.0.0.3 24488 10.0.0.3 4023 udp vrf 0 + * fd01:1::3 23 64:ff9b::a00:3 24488 10.0.0.3 4023 10.0.0.3 24488 udp vrf 0 + * To show NAT64 ICMP session table use: + * vpp# show nat64 session table icmp + * NAT64 icmp session table: + * fd01:1::2 64:ff9b::ac10:202 6305 10.0.0.3 172.16.2.2 63209 icmp vrf 0 + * @cliexend +?*/ +VLIB_CLI_COMMAND (show_nat64_st_command, static) = { + .path = "show nat64 session table", + .short_help = "show nat64 session table all|tcp|udp|icmp|unknown", + .function = nat64_show_st_command_fn, +}; + +/*? + * @cliexpar + * @cliexstart{nat64 add prefix} + * Set NAT64 prefix for generating IPv6 representations of IPv4 addresses. + * To set NAT64 global prefix use: + * vpp# nat64 add prefix 2001:db8::/32 + * To set NAT64 prefix for specific tenant use: + * vpp# nat64 add prefix 2001:db8:122:300::/56 tenant-vrf 10 + * @cliexend +?*/ +VLIB_CLI_COMMAND (nat64_add_del_prefix_command, static) = { + .path = "nat64 add prefix", + .short_help = "nat64 add prefix <ip6-prefix>/<plen> [tenant-vrf <vrf-id>] " + "[del]", + .function = nat64_add_del_prefix_command_fn, +}; + +/*? + * @cliexpar + * @cliexstart{show nat64 prefix} + * Show NAT64 prefix. + * To show NAT64 prefix use: + * vpp# show nat64 prefix + * NAT64 prefix: + * 2001:db8::/32 tenant-vrf 0 + * 2001:db8:122:300::/56 tenant-vrf 10 + * @cliexend +?*/ +VLIB_CLI_COMMAND (show_nat64_prefix_command, static) = { + .path = "show nat64 prefix", + .short_help = "show nat64 prefix", + .function = nat64_show_prefix_command_fn, +}; + +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/nat/nat64_db.c b/src/plugins/nat/nat64_db.c new file mode 100644 index 00000000..da73ceee --- /dev/null +++ b/src/plugins/nat/nat64_db.c @@ -0,0 +1,603 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +/** + * @file + * @brief NAT64 DB + */ +#include <nat/nat64_db.h> + +int +nat64_db_init (nat64_db_t * db) +{ + u32 bib_buckets = 1024; + u32 bib_memory_size = 128 << 20; + u32 st_buckets = 2048; + u32 st_memory_size = 256 << 20; + + clib_bihash_init_24_8 (&db->bib.in2out, "bib-in2out", bib_buckets, + bib_memory_size); + + clib_bihash_init_24_8 (&db->bib.out2in, "bib-out2in", bib_buckets, + bib_memory_size); + + clib_bihash_init_48_8 (&db->st.in2out, "st-in2out", st_buckets, + st_memory_size); + + clib_bihash_init_48_8 (&db->st.out2in, "st-out2in", st_buckets, + st_memory_size); + + return 0; +} + +nat64_db_bib_entry_t * +nat64_db_bib_entry_create (nat64_db_t * db, ip6_address_t * in_addr, + ip4_address_t * out_addr, u16 in_port, + u16 out_port, u32 fib_index, u8 proto, + u8 is_static) +{ + nat64_db_bib_entry_t *bibe; + nat64_db_bib_entry_key_t bibe_key; + clib_bihash_kv_24_8_t kv; + + /* create pool entry */ + switch (ip_proto_to_snat_proto (proto)) + { +/* *INDENT-OFF* */ +#define _(N, i, n, s) \ + case SNAT_PROTOCOL_##N: \ + pool_get (db->bib._##n##_bib, bibe); \ + kv.value = bibe - db->bib._##n##_bib; \ + break; + foreach_snat_protocol +#undef _ +/* *INDENT-ON* */ + default: + pool_get (db->bib._unk_proto_bib, bibe); + kv.value = bibe - db->bib._unk_proto_bib; + break; + } + memset (bibe, 0, sizeof (*bibe)); + bibe->in_addr.as_u64[0] = in_addr->as_u64[0]; + bibe->in_addr.as_u64[1] = in_addr->as_u64[1]; + bibe->in_port = in_port; + bibe->out_addr.as_u32 = out_addr->as_u32; + bibe->out_port = out_port; + bibe->fib_index = fib_index; + bibe->proto = proto; + bibe->is_static = is_static; + + /* create hash lookup */ + bibe_key.addr.as_u64[0] = bibe->in_addr.as_u64[0]; + bibe_key.addr.as_u64[1] = bibe->in_addr.as_u64[1]; + bibe_key.fib_index = bibe->fib_index; + bibe_key.port = bibe->in_port; + bibe_key.proto = bibe->proto; + bibe_key.rsvd = 0; + kv.key[0] = bibe_key.as_u64[0]; + kv.key[1] = bibe_key.as_u64[1]; + kv.key[2] = bibe_key.as_u64[2]; + clib_bihash_add_del_24_8 (&db->bib.in2out, &kv, 1); + + memset (&bibe_key.addr, 0, sizeof (bibe_key.addr)); + bibe_key.addr.ip4.as_u32 = bibe->out_addr.as_u32; + bibe_key.fib_index = 0; + bibe_key.port = bibe->out_port; + kv.key[0] = bibe_key.as_u64[0]; + kv.key[1] = bibe_key.as_u64[1]; + kv.key[2] = bibe_key.as_u64[2]; + clib_bihash_add_del_24_8 (&db->bib.out2in, &kv, 1); + + return bibe; +} + +void +nat64_db_bib_entry_free (nat64_db_t * db, nat64_db_bib_entry_t * bibe) +{ + nat64_db_bib_entry_key_t bibe_key; + clib_bihash_kv_24_8_t kv; + nat64_db_bib_entry_t *bib; + u32 *ste_to_be_free = 0, *ste_index, bibe_index; + nat64_db_st_entry_t *st, *ste; + + switch (ip_proto_to_snat_proto (bibe->proto)) + { +/* *INDENT-OFF* */ +#define _(N, i, n, s) \ + case SNAT_PROTOCOL_##N: \ + bib = db->bib._##n##_bib; \ + st = db->st._##n##_st; \ + break; + foreach_snat_protocol +#undef _ +/* *INDENT-ON* */ + default: + bib = db->bib._unk_proto_bib; + st = db->st._unk_proto_st; + break; + } + + bibe_index = bibe - bib; + + /* delete ST entries for static BIB entry */ + if (bibe->is_static) + { + pool_foreach (ste, st, ( + { + if (ste->bibe_index == bibe_index) + vec_add1 (ste_to_be_free, ste - st);} + )); + vec_foreach (ste_index, ste_to_be_free) + nat64_db_st_entry_free (db, pool_elt_at_index (st, ste_index[0])); + vec_free (ste_to_be_free); + } + + /* delete hash lookup */ + bibe_key.addr.as_u64[0] = bibe->in_addr.as_u64[0]; + bibe_key.addr.as_u64[1] = bibe->in_addr.as_u64[1]; + bibe_key.fib_index = bibe->fib_index; + bibe_key.port = bibe->in_port; + 
bibe_key.proto = bibe->proto; + bibe_key.rsvd = 0; + kv.key[0] = bibe_key.as_u64[0]; + kv.key[1] = bibe_key.as_u64[1]; + kv.key[2] = bibe_key.as_u64[2]; + clib_bihash_add_del_24_8 (&db->bib.in2out, &kv, 0); + + memset (&bibe_key.addr, 0, sizeof (bibe_key.addr)); + bibe_key.addr.ip4.as_u32 = bibe->out_addr.as_u32; + bibe_key.fib_index = 0; + bibe_key.port = bibe->out_port; + kv.key[0] = bibe_key.as_u64[0]; + kv.key[1] = bibe_key.as_u64[1]; + kv.key[2] = bibe_key.as_u64[2]; + clib_bihash_add_del_24_8 (&db->bib.out2in, &kv, 0); + + /* delete from pool */ + pool_put (bib, bibe); + +} + +nat64_db_bib_entry_t * +nat64_db_bib_entry_find (nat64_db_t * db, ip46_address_t * addr, u16 port, + u8 proto, u32 fib_index, u8 is_ip6) +{ + nat64_db_bib_entry_t *bibe = 0; + nat64_db_bib_entry_key_t bibe_key; + clib_bihash_kv_24_8_t kv, value; + nat64_db_bib_entry_t *bib; + + switch (ip_proto_to_snat_proto (proto)) + { +/* *INDENT-OFF* */ +#define _(N, i, n, s) \ + case SNAT_PROTOCOL_##N: \ + bib = db->bib._##n##_bib; \ + break; + foreach_snat_protocol +#undef _ +/* *INDENT-ON* */ + default: + bib = db->bib._unk_proto_bib; + break; + } + + bibe_key.addr.as_u64[0] = addr->as_u64[0]; + bibe_key.addr.as_u64[1] = addr->as_u64[1]; + bibe_key.fib_index = fib_index; + bibe_key.port = port; + bibe_key.proto = proto; + bibe_key.rsvd = 0; + + kv.key[0] = bibe_key.as_u64[0]; + kv.key[1] = bibe_key.as_u64[1]; + kv.key[2] = bibe_key.as_u64[2]; + + if (!clib_bihash_search_24_8 + (is_ip6 ? &db->bib.in2out : &db->bib.out2in, &kv, &value)) + bibe = pool_elt_at_index (bib, value.value); + + return bibe; +} + +void +nat64_db_bib_walk (nat64_db_t * db, u8 proto, + nat64_db_bib_walk_fn_t fn, void *ctx) +{ + nat64_db_bib_entry_t *bib, *bibe; + + if (proto == 255) + { + /* *INDENT-OFF* */ + #define _(N, i, n, s) \ + bib = db->bib._##n##_bib; \ + pool_foreach (bibe, bib, ({ \ + if (fn (bibe, ctx)) \ + return; \ + })); + foreach_snat_protocol + #undef _ + bib = db->bib._unk_proto_bib; + pool_foreach (bibe, bib, ({ + if (fn (bibe, ctx)) + return; + })); + /* *INDENT-ON* */ + } + else + { + switch (ip_proto_to_snat_proto (proto)) + { + /* *INDENT-OFF* */ + #define _(N, i, n, s) \ + case SNAT_PROTOCOL_##N: \ + bib = db->bib._##n##_bib; \ + break; + foreach_snat_protocol + #undef _ + /* *INDENT-ON* */ + default: + bib = db->bib._unk_proto_bib; + break; + } + + /* *INDENT-OFF* */ + pool_foreach (bibe, bib, + ({ + if (fn (bibe, ctx)) + return; + })); + /* *INDENT-ON* */ + } +} + +nat64_db_bib_entry_t * +nat64_db_bib_entry_by_index (nat64_db_t * db, u8 proto, u32 bibe_index) +{ + nat64_db_bib_entry_t *bib; + + switch (ip_proto_to_snat_proto (proto)) + { +/* *INDENT-OFF* */ +#define _(N, i, n, s) \ + case SNAT_PROTOCOL_##N: \ + bib = db->bib._##n##_bib; \ + break; + foreach_snat_protocol +#undef _ +/* *INDENT-ON* */ + default: + bib = db->bib._unk_proto_bib; + break; + } + + return pool_elt_at_index (bib, bibe_index); +} + +void +nat64_db_st_walk (nat64_db_t * db, u8 proto, + nat64_db_st_walk_fn_t fn, void *ctx) +{ + nat64_db_st_entry_t *st, *ste; + + if (proto == 255) + { + /* *INDENT-OFF* */ + #define _(N, i, n, s) \ + st = db->st._##n##_st; \ + pool_foreach (ste, st, ({ \ + if (fn (ste, ctx)) \ + return; \ + })); + foreach_snat_protocol + #undef _ + st = db->st._unk_proto_st; + pool_foreach (ste, st, ({ + if (fn (ste, ctx)) + return; + })); + /* *INDENT-ON* */ + } + else + { + switch (ip_proto_to_snat_proto (proto)) + { + /* *INDENT-OFF* */ + #define _(N, i, n, s) \ + case SNAT_PROTOCOL_##N: \ + st = db->st._##n##_st; \ + break; + 
foreach_snat_protocol + #undef _ + /* *INDENT-ON* */ + default: + st = db->st._unk_proto_st; + break; + } + + /* *INDENT-OFF* */ + pool_foreach (ste, st, + ({ + if (fn (ste, ctx)) + return; + })); + /* *INDENT-ON* */ + } +} + +nat64_db_st_entry_t * +nat64_db_st_entry_create (nat64_db_t * db, nat64_db_bib_entry_t * bibe, + ip6_address_t * in_r_addr, + ip4_address_t * out_r_addr, u16 r_port) +{ + nat64_db_st_entry_t *ste; + nat64_db_bib_entry_t *bib; + nat64_db_st_entry_key_t ste_key; + clib_bihash_kv_48_8_t kv; + + /* create pool entry */ + switch (ip_proto_to_snat_proto (bibe->proto)) + { +/* *INDENT-OFF* */ +#define _(N, i, n, s) \ + case SNAT_PROTOCOL_##N: \ + pool_get (db->st._##n##_st, ste); \ + kv.value = ste - db->st._##n##_st; \ + bib = db->bib._##n##_bib; \ + break; + foreach_snat_protocol +#undef _ +/* *INDENT-ON* */ + default: + pool_get (db->st._unk_proto_st, ste); + kv.value = ste - db->st._unk_proto_st; + bib = db->bib._unk_proto_bib; + break; + } + memset (ste, 0, sizeof (*ste)); + ste->in_r_addr.as_u64[0] = in_r_addr->as_u64[0]; + ste->in_r_addr.as_u64[1] = in_r_addr->as_u64[1]; + ste->out_r_addr.as_u32 = out_r_addr->as_u32; + ste->r_port = r_port; + ste->bibe_index = bibe - bib; + ste->proto = bibe->proto; + + /* increment session number for BIB entry */ + bibe->ses_num++; + + /* create hash lookup */ + memset (&ste_key, 0, sizeof (ste_key)); + ste_key.l_addr.as_u64[0] = bibe->in_addr.as_u64[0]; + ste_key.l_addr.as_u64[1] = bibe->in_addr.as_u64[1]; + ste_key.r_addr.as_u64[0] = ste->in_r_addr.as_u64[0]; + ste_key.r_addr.as_u64[1] = ste->in_r_addr.as_u64[1]; + ste_key.fib_index = bibe->fib_index; + ste_key.l_port = bibe->in_port; + ste_key.r_port = ste->r_port; + ste_key.proto = ste->proto; + kv.key[0] = ste_key.as_u64[0]; + kv.key[1] = ste_key.as_u64[1]; + kv.key[2] = ste_key.as_u64[2]; + kv.key[3] = ste_key.as_u64[3]; + kv.key[4] = ste_key.as_u64[4]; + kv.key[5] = ste_key.as_u64[5]; + clib_bihash_add_del_48_8 (&db->st.in2out, &kv, 1); + + memset (&ste_key, 0, sizeof (ste_key)); + ste_key.l_addr.ip4.as_u32 = bibe->out_addr.as_u32; + ste_key.r_addr.ip4.as_u32 = ste->out_r_addr.as_u32; + ste_key.l_port = bibe->out_port; + ste_key.r_port = ste->r_port; + ste_key.proto = ste->proto; + kv.key[0] = ste_key.as_u64[0]; + kv.key[1] = ste_key.as_u64[1]; + kv.key[2] = ste_key.as_u64[2]; + kv.key[3] = ste_key.as_u64[3]; + kv.key[4] = ste_key.as_u64[4]; + kv.key[5] = ste_key.as_u64[5]; + clib_bihash_add_del_48_8 (&db->st.out2in, &kv, 1); + + return ste; +} + +void +nat64_db_st_entry_free (nat64_db_t * db, nat64_db_st_entry_t * ste) +{ + nat64_db_st_entry_t *st; + nat64_db_bib_entry_t *bib, *bibe; + nat64_db_st_entry_key_t ste_key; + clib_bihash_kv_48_8_t kv; + + switch (ip_proto_to_snat_proto (ste->proto)) + { +/* *INDENT-OFF* */ +#define _(N, i, n, s) \ + case SNAT_PROTOCOL_##N: \ + st = db->st._##n##_st; \ + bib = db->bib._##n##_bib; \ + break; + foreach_snat_protocol +#undef _ +/* *INDENT-ON* */ + default: + st = db->st._unk_proto_st; + bib = db->bib._unk_proto_bib; + break; + } + + bibe = pool_elt_at_index (bib, ste->bibe_index); + + /* delete hash lookup */ + memset (&ste_key, 0, sizeof (ste_key)); + ste_key.l_addr.as_u64[0] = bibe->in_addr.as_u64[0]; + ste_key.l_addr.as_u64[1] = bibe->in_addr.as_u64[1]; + ste_key.r_addr.as_u64[0] = ste->in_r_addr.as_u64[0]; + ste_key.r_addr.as_u64[1] = ste->in_r_addr.as_u64[1]; + ste_key.fib_index = bibe->fib_index; + ste_key.l_port = bibe->in_port; + ste_key.r_port = ste->r_port; + ste_key.proto = ste->proto; + kv.key[0] = ste_key.as_u64[0]; + 
kv.key[1] = ste_key.as_u64[1]; + kv.key[2] = ste_key.as_u64[2]; + kv.key[3] = ste_key.as_u64[3]; + kv.key[4] = ste_key.as_u64[4]; + kv.key[5] = ste_key.as_u64[5]; + clib_bihash_add_del_48_8 (&db->st.in2out, &kv, 0); + + memset (&ste_key, 0, sizeof (ste_key)); + ste_key.l_addr.ip4.as_u32 = bibe->out_addr.as_u32; + ste_key.r_addr.ip4.as_u32 = ste->out_r_addr.as_u32; + ste_key.l_port = bibe->out_port; + ste_key.r_port = ste->r_port; + ste_key.proto = ste->proto; + kv.key[0] = ste_key.as_u64[0]; + kv.key[1] = ste_key.as_u64[1]; + kv.key[2] = ste_key.as_u64[2]; + kv.key[3] = ste_key.as_u64[3]; + kv.key[4] = ste_key.as_u64[4]; + kv.key[5] = ste_key.as_u64[5]; + clib_bihash_add_del_48_8 (&db->st.out2in, &kv, 0); + + /* delete from pool */ + pool_put (st, ste); + + /* decrement session number for BIB entry */ + bibe->ses_num--; + + /* delete BIB entry if last session and dynamic */ + if (!bibe->is_static && !bibe->ses_num) + nat64_db_bib_entry_free (db, bibe); +} + +nat64_db_st_entry_t * +nat64_db_st_entry_find (nat64_db_t * db, ip46_address_t * l_addr, + ip46_address_t * r_addr, u16 l_port, u16 r_port, + u8 proto, u32 fib_index, u8 is_ip6) +{ + nat64_db_st_entry_t *ste = 0; + nat64_db_st_entry_t *st; + nat64_db_st_entry_key_t ste_key; + clib_bihash_kv_48_8_t kv, value; + + switch (ip_proto_to_snat_proto (proto)) + { +/* *INDENT-OFF* */ +#define _(N, i, n, s) \ + case SNAT_PROTOCOL_##N: \ + st = db->st._##n##_st; \ + break; + foreach_snat_protocol +#undef _ +/* *INDENT-ON* */ + default: + st = db->st._unk_proto_st; + break; + } + + memset (&ste_key, 0, sizeof (ste_key)); + ste_key.l_addr.as_u64[0] = l_addr->as_u64[0]; + ste_key.l_addr.as_u64[1] = l_addr->as_u64[1]; + ste_key.r_addr.as_u64[0] = r_addr->as_u64[0]; + ste_key.r_addr.as_u64[1] = r_addr->as_u64[1]; + ste_key.fib_index = fib_index; + ste_key.l_port = l_port; + ste_key.r_port = r_port; + ste_key.proto = proto; + kv.key[0] = ste_key.as_u64[0]; + kv.key[1] = ste_key.as_u64[1]; + kv.key[2] = ste_key.as_u64[2]; + kv.key[3] = ste_key.as_u64[3]; + kv.key[4] = ste_key.as_u64[4]; + kv.key[5] = ste_key.as_u64[5]; + + if (!clib_bihash_search_48_8 + (is_ip6 ? 
&db->st.in2out : &db->st.out2in, &kv, &value)) + ste = pool_elt_at_index (st, value.value); + + return ste; +} + +void +nad64_db_st_free_expired (nat64_db_t * db, u32 now) +{ + u32 *ste_to_be_free = 0, *ste_index; + nat64_db_st_entry_t *st, *ste; + +/* *INDENT-OFF* */ +#define _(N, i, n, s) \ + st = db->st._##n##_st; \ + pool_foreach (ste, st, ({\ + if (i == SNAT_PROTOCOL_TCP && !ste->tcp_state) \ + continue; \ + if (ste->expire < now) \ + vec_add1 (ste_to_be_free, ste - st); \ + })); \ + vec_foreach (ste_index, ste_to_be_free) \ + nat64_db_st_entry_free (db, pool_elt_at_index(st, ste_index[0])); \ + vec_free (ste_to_be_free); \ + ste_to_be_free = 0; + foreach_snat_protocol +#undef _ + st = db->st._unk_proto_st; + pool_foreach (ste, st, ({ + if (ste->expire < now) + vec_add1 (ste_to_be_free, ste - st); + })); + vec_foreach (ste_index, ste_to_be_free) + nat64_db_st_entry_free (db, pool_elt_at_index(st, ste_index[0])); + vec_free (ste_to_be_free); +/* *INDENT-ON* */ +} + +void +nat64_db_free_out_addr (nat64_db_t * db, ip4_address_t * out_addr) +{ + u32 *ste_to_be_free = 0, *ste_index; + nat64_db_st_entry_t *st, *ste; + nat64_db_bib_entry_t *bibe; + +/* *INDENT-OFF* */ +#define _(N, i, n, s) \ + st = db->st._##n##_st; \ + pool_foreach (ste, st, ({ \ + bibe = pool_elt_at_index (db->bib._##n##_bib, ste->bibe_index); \ + if (bibe->out_addr.as_u32 == out_addr->as_u32) \ + vec_add1 (ste_to_be_free, ste - st); \ + })); \ + vec_foreach (ste_index, ste_to_be_free) \ + nat64_db_st_entry_free (db, pool_elt_at_index(st, ste_index[0])); \ + vec_free (ste_to_be_free); \ + ste_to_be_free = 0; + foreach_snat_protocol +#undef _ + st = db->st._unk_proto_st; + pool_foreach (ste, st, ({ + bibe = pool_elt_at_index (db->bib._unk_proto_bib, ste->bibe_index); + if (bibe->out_addr.as_u32 == out_addr->as_u32) + vec_add1 (ste_to_be_free, ste - st); + })); + vec_foreach (ste_index, ste_to_be_free) + nat64_db_st_entry_free (db, pool_elt_at_index(st, ste_index[0])); + vec_free (ste_to_be_free); +/* *INDENT-ON* */ +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/nat/nat64_db.h b/src/plugins/nat/nat64_db.h new file mode 100644 index 00000000..394ca875 --- /dev/null +++ b/src/plugins/nat/nat64_db.h @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+/**
+ * @file
+ * @brief NAT64 DB
+ */
+#ifndef __included_nat64_db_h__
+#define __included_nat64_db_h__
+
+#include <vppinfra/bihash_24_8.h>
+#include <vppinfra/bihash_48_8.h>
+#include <nat/nat.h>
+
+
+typedef struct
+{
+  union
+  {
+    struct
+    {
+      ip46_address_t addr;
+      u32 fib_index;
+      u16 port;
+      u8 proto;
+      u8 rsvd;
+    };
+    u64 as_u64[3];
+  };
+} nat64_db_bib_entry_key_t;
+
+/* *INDENT-OFF* */
+typedef CLIB_PACKED(struct
+{
+  ip6_address_t in_addr;
+  u16 in_port;
+  ip4_address_t out_addr;
+  u16 out_port;
+  u32 fib_index;
+  u32 ses_num;
+  u8 proto;
+  u8 is_static;
+}) nat64_db_bib_entry_t;
+/* *INDENT-ON* */
+
+typedef struct
+{
+  /* BIBs */
+/* *INDENT-OFF* */
+#define _(N, i, n, s) \
+  nat64_db_bib_entry_t *_##n##_bib;
+  foreach_snat_protocol
+#undef _
+/* *INDENT-ON* */
+  nat64_db_bib_entry_t *_unk_proto_bib;
+
+  /* BIB lookup */
+  clib_bihash_24_8_t in2out;
+  clib_bihash_24_8_t out2in;
+} nat64_db_bib_t;
+
+typedef struct
+{
+  union
+  {
+    struct
+    {
+      ip46_address_t l_addr;
+      ip46_address_t r_addr;
+      u32 fib_index;
+      u16 l_port;
+      u16 r_port;
+      u8 proto;
+      u8 rsvd[7];
+    };
+    u64 as_u64[6];
+  };
+} nat64_db_st_entry_key_t;
+
+/* *INDENT-OFF* */
+typedef CLIB_PACKED(struct
+{
+  ip6_address_t in_r_addr;
+  ip4_address_t out_r_addr;
+  u16 r_port;
+  u32 bibe_index;
+  u32 expire;
+  u8 proto;
+  u8 tcp_state;
+}) nat64_db_st_entry_t;
+/* *INDENT-ON* */
+
+typedef struct
+{
+  /* session tables */
+/* *INDENT-OFF* */
+#define _(N, i, n, s) \
+  nat64_db_st_entry_t *_##n##_st;
+  foreach_snat_protocol
+#undef _
+/* *INDENT-ON* */
+  nat64_db_st_entry_t *_unk_proto_st;
+
+  /* session lookup */
+  clib_bihash_48_8_t in2out;
+  clib_bihash_48_8_t out2in;
+} nat64_db_st_t;
+
+typedef struct
+{
+  nat64_db_bib_t bib;
+  nat64_db_st_t st;
+} nat64_db_t;
+
+/**
+ * @brief Initialize NAT64 DB.
+ *
+ * @param db NAT64 DB.
+ *
+ * @returns 0 on success, non-zero value otherwise.
+ */
+int nat64_db_init (nat64_db_t * db);
+
+/**
+ * @brief Create new NAT64 BIB entry.
+ *
+ * @param db NAT64 DB.
+ * @param in_addr Inside IPv6 address.
+ * @param out_addr Outside IPv4 address.
+ * @param in_port Inside port number.
+ * @param out_port Outside port number.
+ * @param fib_index FIB index.
+ * @param proto L4 protocol.
+ * @param is_static 1 if static, 0 if dynamic.
+ *
+ * @returns BIB entry on success, 0 otherwise.
+ */
+nat64_db_bib_entry_t *nat64_db_bib_entry_create (nat64_db_t * db,
+						 ip6_address_t * in_addr,
+						 ip4_address_t * out_addr,
+						 u16 in_port, u16 out_port,
+						 u32 fib_index,
+						 u8 proto, u8 is_static);
+
+/**
+ * @brief Free NAT64 BIB entry.
+ *
+ * @param db NAT64 DB.
+ * @param bibe BIB entry.
+ */
+void nat64_db_bib_entry_free (nat64_db_t * db, nat64_db_bib_entry_t * bibe);
+
+/**
+ * @brief Callback function invoked while walking the NAT64 BIB; a non-zero
+ * return value stops the walk.
+ */
+typedef int (*nat64_db_bib_walk_fn_t) (nat64_db_bib_entry_t * bibe,
+				       void *ctx);
+/**
+ * @brief Walk NAT64 BIB.
+ *
+ * @param db NAT64 DB.
+ * @param proto BIB L4 protocol:
+ *  - 255 all BIBs
+ *  - 6 TCP BIB
+ *  - 17 UDP BIB
+ *  - 1/58 ICMP BIB
+ *  - otherwise "unknown" protocol BIB
+ * @param fn The function to invoke on each entry visited.
+ * @param ctx A context passed in the visit function.
+ */
+void nat64_db_bib_walk (nat64_db_t * db, u8 proto,
+			nat64_db_bib_walk_fn_t fn, void *ctx);
+
+/**
+ * @brief Find NAT64 BIB entry.
+ *
+ * @param db NAT64 DB.
+ * @param addr IP address.
+ * @param port Port number.
+ * @param proto L4 protocol.
+ * @param fib_index FIB index.
+ * @param is_ip6 1 if find by IPv6 (inside) address, 0 by IPv4 (outside).
+ *
+ * @return BIB entry if found.
+ */
+nat64_db_bib_entry_t *nat64_db_bib_entry_find (nat64_db_t * db,
+					       ip46_address_t * addr,
+					       u16 port,
+					       u8 proto,
+					       u32 fib_index, u8 is_ip6);
+
+/**
+ * @brief Get BIB entry by index and protocol.
+ *
+ * @param db NAT64 DB.
+ * @param proto L4 protocol.
+ * @param bibe_index BIB entry index.
+ *
+ * @return BIB entry if found.
+ */
+nat64_db_bib_entry_t *nat64_db_bib_entry_by_index (nat64_db_t * db,
+						   u8 proto, u32 bibe_index);
+/**
+ * @brief Create new NAT64 session table entry.
+ *
+ * @param db NAT64 DB.
+ * @param bibe Corresponding BIB entry.
+ * @param in_r_addr Inside IPv6 address of the remote host.
+ * @param out_r_addr Outside IPv4 address of the remote host.
+ * @param r_port Remote host port number.
+ *
+ * @returns session table entry on success, 0 otherwise.
+ */
+nat64_db_st_entry_t *nat64_db_st_entry_create (nat64_db_t * db,
+					       nat64_db_bib_entry_t * bibe,
+					       ip6_address_t * in_r_addr,
+					       ip4_address_t * out_r_addr,
+					       u16 r_port);
+
+/**
+ * @brief Free NAT64 session table entry.
+ *
+ * @param db NAT64 DB.
+ * @param ste Session table entry.
+ */
+void nat64_db_st_entry_free (nat64_db_t * db, nat64_db_st_entry_t * ste);
+
+/**
+ * @brief Find NAT64 session table entry.
+ *
+ * @param db NAT64 DB.
+ * @param l_addr Local host address.
+ * @param r_addr Remote host address.
+ * @param l_port Local host port number.
+ * @param r_port Remote host port number.
+ * @param proto L4 protocol.
+ * @param fib_index FIB index.
+ * @param is_ip6 1 if find by IPv6 (inside) address, 0 by IPv4 (outside).
+ *
+ * @return session table entry if found.
+ */
+nat64_db_st_entry_t *nat64_db_st_entry_find (nat64_db_t * db,
+					     ip46_address_t * l_addr,
+					     ip46_address_t * r_addr,
+					     u16 l_port, u16 r_port,
+					     u8 proto,
+					     u32 fib_index, u8 is_ip6);
+
+/**
+ * @brief Callback function invoked while walking the NAT64 session table;
+ * a non-zero return value stops the walk.
+ */
+typedef int (*nat64_db_st_walk_fn_t) (nat64_db_st_entry_t * ste, void *ctx);
+
+/**
+ * @brief Walk NAT64 session table.
+ *
+ * @param db NAT64 DB.
+ * @param proto L4 protocol:
+ *  - 255 all session tables
+ *  - 6 TCP session table
+ *  - 17 UDP session table
+ *  - 1/58 ICMP session table
+ *  - otherwise "unknown" protocol session table
+ * @param fn The function to invoke on each entry visited.
+ * @param ctx A context passed in the visit function.
+ */
+void nat64_db_st_walk (nat64_db_t * db, u8 proto,
+		       nat64_db_st_walk_fn_t fn, void *ctx);
+
+/**
+ * @brief Free expired session entries in session tables.
+ *
+ * @param db NAT64 DB.
+ * @param now Current time.
+ */
+void nad64_db_st_free_expired (nat64_db_t * db, u32 now);
+
+/**
+ * @brief Free sessions using specific outside address.
+ *
+ * @param db NAT64 DB.
+ * @param out_addr Outside address to match.
+ */
+void nat64_db_free_out_addr (nat64_db_t * db, ip4_address_t * out_addr);
+
+#endif /* __included_nat64_db_h__ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/nat/nat64_doc.md b/src/plugins/nat/nat64_doc.md
new file mode 100644
index 00000000..f65b4633
--- /dev/null
+++ b/src/plugins/nat/nat64_doc.md
@@ -0,0 +1,73 @@
+# Stateful NAT64: Network Address and Protocol Translation from IPv6 Clients to IPv4 Servers {#nat64_doc}
+
+## Introduction
+
+Stateful NAT64 in VPP allows IPv6-only clients to contact IPv4 servers using unicast UDP, TCP, or ICMP based on RFC 6146.
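+
+As a quick orientation before the individual commands are described in the
+sections below, a minimal working configuration could look like this (the
+interface names and the pool address are purely illustrative):
+
+> set interface nat64 in GigabitEthernet0/8/0
+> set interface nat64 out GigabitEthernet0/a/0
+> nat64 add pool address 10.1.1.3
+
+With no prefix configured, the Well-Known Prefix is used, so an IPv6-only
+client can reach the IPv4 server 172.16.2.2 as 64:ff9b::172.16.2.2.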
+
+## Configuration
+
+### Enable/disable NAT64 feature on the interface
+
+> set interface nat64 in|out <intfc> [del]
+
+in: inside/local/IPv6 network
+out: outside/external/IPv4 network
+intfc: interface name
+
+### Add/delete NAT64 pool address
+
+One or more public IPv4 addresses assigned to a NAT64 are shared among several IPv6-only clients.
+
+> nat64 add pool address <ip4-range-start> [- <ip4-range-end>] [tenant-vrf <tenant-vrf-id>] [del]
+
+ip4-range-start: First IPv4 address of the range
+ip4-range-end: Last IPv4 address of the range (optional, not used for single address)
+tenant-vrf-id: VRF id of the tenant associated with the pool address (optional, if not set pool address is global)
+
+### Add/delete static BIB entry
+
+Stateful NAT64 also supports IPv4-initiated communications to a subset of the IPv6 hosts through statically configured bindings.
+
+> nat64 add static bib <ip6-addr> <in-port> <ip4-addr> <out-port> tcp|udp|icmp [vrf <table-id>] [del]
+
+ip6-addr: inside IPv6 address of the host
+in-port: inside port or ICMPv6 identifier
+ip4-addr: outside IPv4 address of the host
+out-port: outside port or ICMPv4 identifier
+table-id: VRF id of the tenant associated with the BIB entry (optional; the global VRF is used by default)
+
+### Set NAT64 session timeouts
+
+A session is deleted when its timer expires. If all sessions corresponding to a dynamically created BIB entry are deleted, the BIB entry is also deleted. While packets are flowing, the session timer is refreshed to keep the session alive.
+
+> set nat64 timeouts udp <sec> icmp <sec> tcp-trans <sec> tcp-est <sec> tcp-incoming-syn <sec> | reset
+
+udp: UDP session timeout value (default 300sec)
+icmp: ICMP session timeout value (default 60sec)
+tcp-trans: transitory TCP session timeout value (default 240sec)
+tcp-est: established TCP session timeout value (default 7440sec)
+tcp-incoming-syn: incoming SYN TCP session timeout value (default 6sec)
+reset: reset timers to default values
+
+### Set NAT64 prefix
+
+Stateful NAT64 supports the algorithm for generating IPv6 representations of IPv4 addresses defined in RFC 6052. If no prefix is configured, the Well-Known Prefix (64:ff9b::/96) is used.
+
+> nat64 add prefix <ip6-prefix>/<plen> [tenant-vrf <vrf-id>] [del]
+
+ip6-prefix: IPv6 prefix
+plen: prefix length (valid values: 32, 40, 48, 56, 64, or 96)
+tenant-vrf: VRF id of the tenant associated with the prefix
+
+### Show commands
+
+> show nat64 pool
+> show nat64 interfaces
+> show nat64 bib tcp|udp|icmp
+> show nat64 session table tcp|udp|icmp
+> show nat64 timeouts
+> show nat64 prefix
+
+## Notes
+
+Multi-threading is not supported yet (CLI/API commands are disabled when VPP runs with multiple threads).
diff --git a/src/plugins/nat/nat64_in2out.c b/src/plugins/nat/nat64_in2out.c
new file mode 100644
index 00000000..f78baff4
--- /dev/null
+++ b/src/plugins/nat/nat64_in2out.c
@@ -0,0 +1,1118 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @file
+ * @brief NAT64 IPv6 to IPv4 translation (inside to outside network)
+ */
+
+#include <nat/nat64.h>
+#include <vnet/ip/ip6_to_ip4.h>
+#include <vnet/fib/fib_table.h>
+
+typedef struct
+{
+  u32 sw_if_index;
+  u32 next_index;
+  u8 is_slow_path;
+} nat64_in2out_trace_t;
+
+static u8 *
+format_nat64_in2out_trace (u8 * s, va_list * args)
+{
+  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+  nat64_in2out_trace_t *t = va_arg (*args, nat64_in2out_trace_t *);
+  char *tag;
+
+  tag = t->is_slow_path ? "NAT64-in2out-slowpath" : "NAT64-in2out";
+
+  s =
+    format (s, "%s: sw_if_index %d, next index %d", tag, t->sw_if_index,
+	    t->next_index);
+
+  return s;
+}
+
+vlib_node_registration_t nat64_in2out_node;
+vlib_node_registration_t nat64_in2out_slowpath_node;
+
+#define foreach_nat64_in2out_error \
+_(UNSUPPORTED_PROTOCOL, "unsupported protocol") \
+_(IN2OUT_PACKETS, "good in2out packets processed") \
+_(NO_TRANSLATION, "no translation") \
+_(UNKNOWN, "unknown")
+
+typedef enum
+{
+#define _(sym,str) NAT64_IN2OUT_ERROR_##sym,
+  foreach_nat64_in2out_error
+#undef _
+    NAT64_IN2OUT_N_ERROR,
+} nat64_in2out_error_t;
+
+static char *nat64_in2out_error_strings[] = {
+#define _(sym,string) string,
+  foreach_nat64_in2out_error
+#undef _
+};
+
+typedef enum
+{
+  NAT64_IN2OUT_NEXT_IP4_LOOKUP,
+  NAT64_IN2OUT_NEXT_IP6_LOOKUP,
+  NAT64_IN2OUT_NEXT_DROP,
+  NAT64_IN2OUT_NEXT_SLOWPATH,
+  NAT64_IN2OUT_N_NEXT,
+} nat64_in2out_next_t;
+
+typedef struct nat64_in2out_set_ctx_t_
+{
+  vlib_buffer_t *b;
+  vlib_main_t *vm;
+} nat64_in2out_set_ctx_t;
+
+/**
+ * @brief Check whether the packet is hairpinning.
+ *
+ * If the destination IP address of the packet is an IPv4 address assigned to
+ * the NAT64 itself, then the packet is a hairpin packet.
+ *
+ * @param dst_addr Destination address of the packet.
+ *
+ * @returns 1 if hairpinning, otherwise 0.
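+ *
+ * Example (addresses illustrative): with 10.1.1.3 in the NAT64 pool and
+ * the well-known prefix in use, an inside packet destined to
+ * 64:ff9b::10.1.1.3 is a hairpin packet, because the embedded IPv4
+ * destination is owned by the NAT64 itself.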
+ */ +static_always_inline int +is_hairpinning (ip6_address_t * dst_addr) +{ + nat64_main_t *nm = &nat64_main; + int i; + + for (i = 0; i < vec_len (nm->addr_pool); i++) + { + if (nm->addr_pool[i].addr.as_u32 == dst_addr->as_u32[3]) + return 1; + } + + return 0; +} + +static int +nat64_in2out_tcp_udp_set_cb (ip6_header_t * ip6, ip4_header_t * ip4, + void *arg) +{ + nat64_main_t *nm = &nat64_main; + nat64_in2out_set_ctx_t *ctx = arg; + nat64_db_bib_entry_t *bibe; + nat64_db_st_entry_t *ste; + ip46_address_t saddr, daddr; + u32 sw_if_index, fib_index; + udp_header_t *udp = ip6_next_header (ip6); + u8 proto = ip6->protocol; + u16 sport = udp->src_port; + u16 dport = udp->dst_port; + + sw_if_index = vnet_buffer (ctx->b)->sw_if_index[VLIB_RX]; + fib_index = + fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP6, sw_if_index); + + saddr.as_u64[0] = ip6->src_address.as_u64[0]; + saddr.as_u64[1] = ip6->src_address.as_u64[1]; + daddr.as_u64[0] = ip6->dst_address.as_u64[0]; + daddr.as_u64[1] = ip6->dst_address.as_u64[1]; + + ste = + nat64_db_st_entry_find (&nm->db, &saddr, &daddr, sport, dport, proto, + fib_index, 1); + + if (ste) + { + bibe = nat64_db_bib_entry_by_index (&nm->db, proto, ste->bibe_index); + if (!bibe) + return -1; + } + else + { + bibe = + nat64_db_bib_entry_find (&nm->db, &saddr, sport, proto, fib_index, 1); + + if (!bibe) + { + u16 out_port; + ip4_address_t out_addr; + if (nat64_alloc_out_addr_and_port + (fib_index, ip_proto_to_snat_proto (proto), &out_addr, + &out_port)) + return -1; + + bibe = + nat64_db_bib_entry_create (&nm->db, &ip6->src_address, &out_addr, + sport, clib_host_to_net_u16 (out_port), + fib_index, proto, 0); + if (!bibe) + return -1; + } + + nat64_extract_ip4 (&ip6->dst_address, &daddr.ip4, fib_index); + ste = + nat64_db_st_entry_create (&nm->db, bibe, &ip6->dst_address, + &daddr.ip4, dport); + if (!ste) + return -1; + } + + nat64_session_reset_timeout (ste, ctx->vm); + + ip4->src_address.as_u32 = bibe->out_addr.as_u32; + udp->src_port = bibe->out_port; + + ip4->dst_address.as_u32 = ste->out_r_addr.as_u32; + + if (proto == IP_PROTOCOL_TCP) + { + u16 *checksum; + ip_csum_t csum; + tcp_header_t *tcp = ip6_next_header (ip6); + + checksum = &tcp->checksum; + csum = ip_csum_sub_even (*checksum, sport); + csum = ip_csum_add_even (csum, udp->src_port); + *checksum = ip_csum_fold (csum); + } + + return 0; +} + +static int +nat64_in2out_icmp_set_cb (ip6_header_t * ip6, ip4_header_t * ip4, void *arg) +{ + nat64_main_t *nm = &nat64_main; + nat64_in2out_set_ctx_t *ctx = arg; + nat64_db_bib_entry_t *bibe; + nat64_db_st_entry_t *ste; + ip46_address_t saddr, daddr; + u32 sw_if_index, fib_index; + icmp46_header_t *icmp = ip6_next_header (ip6); + + sw_if_index = vnet_buffer (ctx->b)->sw_if_index[VLIB_RX]; + fib_index = + fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP6, sw_if_index); + + saddr.as_u64[0] = ip6->src_address.as_u64[0]; + saddr.as_u64[1] = ip6->src_address.as_u64[1]; + daddr.as_u64[0] = ip6->dst_address.as_u64[0]; + daddr.as_u64[1] = ip6->dst_address.as_u64[1]; + + if (icmp->type == ICMP4_echo_request || icmp->type == ICMP4_echo_reply) + { + u16 in_id = ((u16 *) (icmp))[2]; + ste = + nat64_db_st_entry_find (&nm->db, &saddr, &daddr, in_id, 0, + IP_PROTOCOL_ICMP, fib_index, 1); + + if (ste) + { + bibe = + nat64_db_bib_entry_by_index (&nm->db, IP_PROTOCOL_ICMP, + ste->bibe_index); + if (!bibe) + return -1; + } + else + { + bibe = + nat64_db_bib_entry_find (&nm->db, &saddr, in_id, + IP_PROTOCOL_ICMP, fib_index, 1); + + if (!bibe) + { + u16 out_id; + ip4_address_t 
out_addr; + if (nat64_alloc_out_addr_and_port + (fib_index, SNAT_PROTOCOL_ICMP, &out_addr, &out_id)) + return -1; + + bibe = + nat64_db_bib_entry_create (&nm->db, &ip6->src_address, + &out_addr, in_id, + clib_host_to_net_u16 (out_id), + fib_index, IP_PROTOCOL_ICMP, 0); + if (!bibe) + return -1; + } + + nat64_extract_ip4 (&ip6->dst_address, &daddr.ip4, fib_index); + ste = + nat64_db_st_entry_create (&nm->db, bibe, &ip6->dst_address, + &daddr.ip4, 0); + if (!ste) + return -1; + } + + nat64_session_reset_timeout (ste, ctx->vm); + + ip4->src_address.as_u32 = bibe->out_addr.as_u32; + ((u16 *) (icmp))[2] = bibe->out_port; + + ip4->dst_address.as_u32 = ste->out_r_addr.as_u32; + } + else + { + if (!vec_len (nm->addr_pool)) + return -1; + + ip4->src_address.as_u32 = nm->addr_pool[0].addr.as_u32; + nat64_extract_ip4 (&ip6->dst_address, &ip4->dst_address, fib_index); + } + + return 0; +} + +static int +nat64_in2out_inner_icmp_set_cb (ip6_header_t * ip6, ip4_header_t * ip4, + void *arg) +{ + nat64_main_t *nm = &nat64_main; + nat64_in2out_set_ctx_t *ctx = arg; + nat64_db_st_entry_t *ste; + nat64_db_bib_entry_t *bibe; + ip46_address_t saddr, daddr; + u32 sw_if_index, fib_index; + u8 proto = ip6->protocol; + + sw_if_index = vnet_buffer (ctx->b)->sw_if_index[VLIB_RX]; + fib_index = + fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP6, sw_if_index); + + saddr.as_u64[0] = ip6->src_address.as_u64[0]; + saddr.as_u64[1] = ip6->src_address.as_u64[1]; + daddr.as_u64[0] = ip6->dst_address.as_u64[0]; + daddr.as_u64[1] = ip6->dst_address.as_u64[1]; + + if (proto == IP_PROTOCOL_ICMP6) + { + icmp46_header_t *icmp = ip6_next_header (ip6); + u16 in_id = ((u16 *) (icmp))[2]; + proto = IP_PROTOCOL_ICMP; + + if (! + (icmp->type == ICMP4_echo_request + || icmp->type == ICMP4_echo_reply)) + return -1; + + ste = + nat64_db_st_entry_find (&nm->db, &daddr, &saddr, in_id, 0, proto, + fib_index, 1); + if (!ste) + return -1; + + bibe = nat64_db_bib_entry_by_index (&nm->db, proto, ste->bibe_index); + if (!bibe) + return -1; + + ip4->dst_address.as_u32 = bibe->out_addr.as_u32; + ((u16 *) (icmp))[2] = bibe->out_port; + ip4->src_address.as_u32 = ste->out_r_addr.as_u32; + } + else + { + udp_header_t *udp = ip6_next_header (ip6); + tcp_header_t *tcp = ip6_next_header (ip6); + u16 *checksum; + ip_csum_t csum; + + u16 sport = udp->src_port; + u16 dport = udp->dst_port; + + ste = + nat64_db_st_entry_find (&nm->db, &daddr, &saddr, dport, sport, proto, + fib_index, 1); + if (!ste) + return -1; + + bibe = nat64_db_bib_entry_by_index (&nm->db, proto, ste->bibe_index); + if (!bibe) + return -1; + + ip4->dst_address.as_u32 = bibe->out_addr.as_u32; + udp->dst_port = bibe->out_port; + ip4->src_address.as_u32 = ste->out_r_addr.as_u32; + + if (proto == IP_PROTOCOL_TCP) + checksum = &tcp->checksum; + else + checksum = &udp->checksum; + csum = ip_csum_sub_even (*checksum, dport); + csum = ip_csum_add_even (csum, udp->dst_port); + *checksum = ip_csum_fold (csum); + } + + return 0; +} + +typedef struct unk_proto_st_walk_ctx_t_ +{ + ip6_address_t src_addr; + ip6_address_t dst_addr; + ip4_address_t out_addr; + u32 fib_index; + u8 proto; +} unk_proto_st_walk_ctx_t; + +static int +unk_proto_st_walk (nat64_db_st_entry_t * ste, void *arg) +{ + nat64_main_t *nm = &nat64_main; + unk_proto_st_walk_ctx_t *ctx = arg; + nat64_db_bib_entry_t *bibe; + ip46_address_t saddr, daddr; + + if (ip46_address_is_equal (&ste->in_r_addr, &ctx->dst_addr)) + { + bibe = + nat64_db_bib_entry_by_index (&nm->db, ste->proto, ste->bibe_index); + if (!bibe) + return -1; + + if 
(ip46_address_is_equal (&bibe->in_addr, &ctx->src_addr) + && bibe->fib_index == ctx->fib_index) + { + memset (&saddr, 0, sizeof (saddr)); + saddr.ip4.as_u32 = bibe->out_addr.as_u32; + memset (&daddr, 0, sizeof (daddr)); + nat64_extract_ip4 (&ctx->dst_addr, &daddr.ip4, ctx->fib_index); + + if (nat64_db_st_entry_find + (&nm->db, &daddr, &saddr, 0, 0, ctx->proto, ctx->fib_index, 0)) + return -1; + + ctx->out_addr.as_u32 = bibe->out_addr.as_u32; + return 1; + } + } + + return 0; +} + +static int +nat64_in2out_unk_proto_set_cb (ip6_header_t * ip6, ip4_header_t * ip4, + void *arg) +{ + nat64_main_t *nm = &nat64_main; + nat64_in2out_set_ctx_t *ctx = arg; + nat64_db_bib_entry_t *bibe; + nat64_db_st_entry_t *ste; + ip46_address_t saddr, daddr, addr; + u32 sw_if_index, fib_index; + u8 proto = ip6->protocol; + int i; + + sw_if_index = vnet_buffer (ctx->b)->sw_if_index[VLIB_RX]; + fib_index = + fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP6, sw_if_index); + + saddr.as_u64[0] = ip6->src_address.as_u64[0]; + saddr.as_u64[1] = ip6->src_address.as_u64[1]; + daddr.as_u64[0] = ip6->dst_address.as_u64[0]; + daddr.as_u64[1] = ip6->dst_address.as_u64[1]; + + ste = + nat64_db_st_entry_find (&nm->db, &saddr, &daddr, 0, 0, proto, fib_index, + 1); + + if (ste) + { + bibe = nat64_db_bib_entry_by_index (&nm->db, proto, ste->bibe_index); + if (!bibe) + return -1; + } + else + { + bibe = + nat64_db_bib_entry_find (&nm->db, &saddr, 0, proto, fib_index, 1); + + if (!bibe) + { + /* Choose same out address as for TCP/UDP session to same dst */ + unk_proto_st_walk_ctx_t ctx = { + .src_addr.as_u64[0] = ip6->src_address.as_u64[0], + .src_addr.as_u64[1] = ip6->src_address.as_u64[1], + .dst_addr.as_u64[0] = ip6->dst_address.as_u64[0], + .dst_addr.as_u64[1] = ip6->dst_address.as_u64[1], + .out_addr.as_u32 = 0, + .fib_index = fib_index, + .proto = proto, + }; + + nat64_db_st_walk (&nm->db, IP_PROTOCOL_TCP, unk_proto_st_walk, + &ctx); + + if (!ctx.out_addr.as_u32) + nat64_db_st_walk (&nm->db, IP_PROTOCOL_UDP, unk_proto_st_walk, + &ctx); + + /* Verify if out address is not already in use for protocol */ + memset (&addr, 0, sizeof (addr)); + addr.ip4.as_u32 = ctx.out_addr.as_u32; + if (nat64_db_bib_entry_find (&nm->db, &addr, 0, proto, 0, 0)) + ctx.out_addr.as_u32 = 0; + + if (!ctx.out_addr.as_u32) + { + for (i = 0; i < vec_len (nm->addr_pool); i++) + { + addr.ip4.as_u32 = nm->addr_pool[i].addr.as_u32; + if (!nat64_db_bib_entry_find + (&nm->db, &addr, 0, proto, 0, 0)) + break; + } + } + + if (!ctx.out_addr.as_u32) + return -1; + + bibe = + nat64_db_bib_entry_create (&nm->db, &ip6->src_address, + &ctx.out_addr, 0, 0, fib_index, proto, + 0); + if (!bibe) + return -1; + } + + nat64_extract_ip4 (&ip6->dst_address, &daddr.ip4, fib_index); + ste = + nat64_db_st_entry_create (&nm->db, bibe, &ip6->dst_address, + &daddr.ip4, 0); + if (!ste) + return -1; + } + + nat64_session_reset_timeout (ste, ctx->vm); + + ip4->src_address.as_u32 = bibe->out_addr.as_u32; + ip4->dst_address.as_u32 = ste->out_r_addr.as_u32; + + return 0; +} + + + +static int +nat64_in2out_tcp_udp_hairpinning (vlib_main_t * vm, vlib_buffer_t * b, + ip6_header_t * ip6) +{ + nat64_main_t *nm = &nat64_main; + nat64_db_bib_entry_t *bibe; + nat64_db_st_entry_t *ste; + ip46_address_t saddr, daddr; + u32 sw_if_index, fib_index; + udp_header_t *udp = ip6_next_header (ip6); + tcp_header_t *tcp = ip6_next_header (ip6); + u8 proto = ip6->protocol; + u16 sport = udp->src_port; + u16 dport = udp->dst_port; + u16 *checksum; + ip_csum_t csum; + + sw_if_index = vnet_buffer 
(b)->sw_if_index[VLIB_RX]; + fib_index = + fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP6, sw_if_index); + + saddr.as_u64[0] = ip6->src_address.as_u64[0]; + saddr.as_u64[1] = ip6->src_address.as_u64[1]; + daddr.as_u64[0] = ip6->dst_address.as_u64[0]; + daddr.as_u64[1] = ip6->dst_address.as_u64[1]; + + if (proto == IP_PROTOCOL_UDP) + checksum = &udp->checksum; + else + checksum = &tcp->checksum; + + csum = ip_csum_sub_even (*checksum, ip6->src_address.as_u64[0]); + csum = ip_csum_sub_even (csum, ip6->src_address.as_u64[1]); + csum = ip_csum_sub_even (csum, ip6->dst_address.as_u64[0]); + csum = ip_csum_sub_even (csum, ip6->dst_address.as_u64[1]); + csum = ip_csum_sub_even (csum, sport); + csum = ip_csum_sub_even (csum, dport); + + ste = + nat64_db_st_entry_find (&nm->db, &saddr, &daddr, sport, dport, proto, + fib_index, 1); + + if (ste) + { + bibe = nat64_db_bib_entry_by_index (&nm->db, proto, ste->bibe_index); + if (!bibe) + return -1; + } + else + { + bibe = + nat64_db_bib_entry_find (&nm->db, &saddr, sport, proto, fib_index, 1); + + if (!bibe) + { + u16 out_port; + ip4_address_t out_addr; + if (nat64_alloc_out_addr_and_port + (fib_index, ip_proto_to_snat_proto (proto), &out_addr, + &out_port)) + return -1; + + bibe = + nat64_db_bib_entry_create (&nm->db, &ip6->src_address, &out_addr, + sport, clib_host_to_net_u16 (out_port), + fib_index, proto, 0); + if (!bibe) + return -1; + } + + nat64_extract_ip4 (&ip6->dst_address, &daddr.ip4, fib_index); + ste = + nat64_db_st_entry_create (&nm->db, bibe, &ip6->dst_address, + &daddr.ip4, dport); + if (!ste) + return -1; + } + + nat64_session_reset_timeout (ste, vm); + + sport = udp->src_port = bibe->out_port; + nat64_compose_ip6 (&ip6->src_address, &bibe->out_addr, fib_index); + + memset (&saddr, 0, sizeof (saddr)); + memset (&daddr, 0, sizeof (daddr)); + saddr.ip4.as_u32 = bibe->out_addr.as_u32; + daddr.ip4.as_u32 = ste->out_r_addr.as_u32; + + ste = + nat64_db_st_entry_find (&nm->db, &daddr, &saddr, dport, sport, proto, 0, + 0); + + if (ste) + { + bibe = nat64_db_bib_entry_by_index (&nm->db, proto, ste->bibe_index); + if (!bibe) + return -1; + } + else + { + bibe = nat64_db_bib_entry_find (&nm->db, &daddr, dport, proto, 0, 0); + + if (!bibe) + return -1; + + ste = + nat64_db_st_entry_create (&nm->db, bibe, &ip6->src_address, + &saddr.ip4, sport); + } + + ip6->dst_address.as_u64[0] = bibe->in_addr.as_u64[0]; + ip6->dst_address.as_u64[1] = bibe->in_addr.as_u64[1]; + udp->dst_port = bibe->in_port; + + csum = ip_csum_add_even (csum, ip6->src_address.as_u64[0]); + csum = ip_csum_add_even (csum, ip6->src_address.as_u64[1]); + csum = ip_csum_add_even (csum, ip6->dst_address.as_u64[0]); + csum = ip_csum_add_even (csum, ip6->dst_address.as_u64[1]); + csum = ip_csum_add_even (csum, udp->src_port); + csum = ip_csum_add_even (csum, udp->dst_port); + *checksum = ip_csum_fold (csum); + + return 0; +} + +static int +nat64_in2out_icmp_hairpinning (vlib_main_t * vm, vlib_buffer_t * b, + ip6_header_t * ip6) +{ + nat64_main_t *nm = &nat64_main; + nat64_db_bib_entry_t *bibe; + nat64_db_st_entry_t *ste; + icmp46_header_t *icmp = ip6_next_header (ip6); + ip6_header_t *inner_ip6; + ip46_address_t saddr, daddr; + u32 sw_if_index, fib_index; + u8 proto; + udp_header_t *udp; + tcp_header_t *tcp; + u16 *checksum, sport, dport; + ip_csum_t csum; + + if (icmp->type == ICMP6_echo_request || icmp->type == ICMP6_echo_reply) + return -1; + + inner_ip6 = (ip6_header_t *) u8_ptr_add (icmp, 8); + + proto = inner_ip6->protocol; + + if (proto == IP_PROTOCOL_ICMP6) + return -1; + + 
sw_if_index = vnet_buffer (b)->sw_if_index[VLIB_RX]; + fib_index = + fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP6, sw_if_index); + + saddr.as_u64[0] = inner_ip6->src_address.as_u64[0]; + saddr.as_u64[1] = inner_ip6->src_address.as_u64[1]; + daddr.as_u64[0] = inner_ip6->dst_address.as_u64[0]; + daddr.as_u64[1] = inner_ip6->dst_address.as_u64[1]; + + udp = ip6_next_header (inner_ip6); + tcp = ip6_next_header (inner_ip6); + + sport = udp->src_port; + dport = udp->dst_port; + + if (proto == IP_PROTOCOL_UDP) + checksum = &udp->checksum; + else + checksum = &tcp->checksum; + + csum = ip_csum_sub_even (*checksum, inner_ip6->src_address.as_u64[0]); + csum = ip_csum_sub_even (csum, inner_ip6->src_address.as_u64[1]); + csum = ip_csum_sub_even (csum, inner_ip6->dst_address.as_u64[0]); + csum = ip_csum_sub_even (csum, inner_ip6->dst_address.as_u64[1]); + csum = ip_csum_sub_even (csum, sport); + csum = ip_csum_sub_even (csum, dport); + + ste = + nat64_db_st_entry_find (&nm->db, &daddr, &saddr, dport, sport, proto, + fib_index, 1); + if (!ste) + return -1; + + bibe = nat64_db_bib_entry_by_index (&nm->db, proto, ste->bibe_index); + if (!bibe) + return -1; + + dport = udp->dst_port = bibe->out_port; + nat64_compose_ip6 (&inner_ip6->dst_address, &bibe->out_addr, fib_index); + + memset (&saddr, 0, sizeof (saddr)); + memset (&daddr, 0, sizeof (daddr)); + saddr.ip4.as_u32 = ste->out_r_addr.as_u32; + daddr.ip4.as_u32 = bibe->out_addr.as_u32; + + ste = + nat64_db_st_entry_find (&nm->db, &saddr, &daddr, sport, dport, proto, 0, + 0); + if (!ste) + return -1; + + bibe = nat64_db_bib_entry_by_index (&nm->db, proto, ste->bibe_index); + if (!bibe) + return -1; + + inner_ip6->src_address.as_u64[0] = bibe->in_addr.as_u64[0]; + inner_ip6->src_address.as_u64[1] = bibe->in_addr.as_u64[1]; + udp->src_port = bibe->in_port; + + csum = ip_csum_add_even (csum, inner_ip6->src_address.as_u64[0]); + csum = ip_csum_add_even (csum, inner_ip6->src_address.as_u64[1]); + csum = ip_csum_add_even (csum, inner_ip6->dst_address.as_u64[0]); + csum = ip_csum_add_even (csum, inner_ip6->dst_address.as_u64[1]); + csum = ip_csum_add_even (csum, udp->src_port); + csum = ip_csum_add_even (csum, udp->dst_port); + *checksum = ip_csum_fold (csum); + + if (!vec_len (nm->addr_pool)) + return -1; + + nat64_compose_ip6 (&ip6->src_address, &nm->addr_pool[0].addr, fib_index); + ip6->dst_address.as_u64[0] = inner_ip6->src_address.as_u64[0]; + ip6->dst_address.as_u64[1] = inner_ip6->src_address.as_u64[1]; + + icmp->checksum = 0; + csum = ip_csum_with_carry (0, ip6->payload_length); + csum = ip_csum_with_carry (csum, clib_host_to_net_u16 (ip6->protocol)); + csum = ip_csum_with_carry (csum, ip6->src_address.as_u64[0]); + csum = ip_csum_with_carry (csum, ip6->src_address.as_u64[1]); + csum = ip_csum_with_carry (csum, ip6->dst_address.as_u64[0]); + csum = ip_csum_with_carry (csum, ip6->dst_address.as_u64[1]); + csum = + ip_incremental_checksum (csum, icmp, + clib_net_to_host_u16 (ip6->payload_length)); + icmp->checksum = ~ip_csum_fold (csum); + + return 0; +} + +static int +nat64_in2out_unk_proto_hairpinning (vlib_main_t * vm, vlib_buffer_t * b, + ip6_header_t * ip6) +{ + nat64_main_t *nm = &nat64_main; + nat64_db_bib_entry_t *bibe; + nat64_db_st_entry_t *ste; + ip46_address_t saddr, daddr, addr; + u32 sw_if_index, fib_index; + u8 proto = ip6->protocol; + int i; + + sw_if_index = vnet_buffer (b)->sw_if_index[VLIB_RX]; + fib_index = + fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP6, sw_if_index); + + saddr.as_u64[0] = ip6->src_address.as_u64[0]; + 
saddr.as_u64[1] = ip6->src_address.as_u64[1]; + daddr.as_u64[0] = ip6->dst_address.as_u64[0]; + daddr.as_u64[1] = ip6->dst_address.as_u64[1]; + + ste = + nat64_db_st_entry_find (&nm->db, &saddr, &daddr, 0, 0, proto, fib_index, + 1); + + if (ste) + { + bibe = nat64_db_bib_entry_by_index (&nm->db, proto, ste->bibe_index); + if (!bibe) + return -1; + } + else + { + bibe = + nat64_db_bib_entry_find (&nm->db, &saddr, 0, proto, fib_index, 1); + + if (!bibe) + { + /* Choose same out address as for TCP/UDP session to same dst */ + unk_proto_st_walk_ctx_t ctx = { + .src_addr.as_u64[0] = ip6->src_address.as_u64[0], + .src_addr.as_u64[1] = ip6->src_address.as_u64[1], + .dst_addr.as_u64[0] = ip6->dst_address.as_u64[0], + .dst_addr.as_u64[1] = ip6->dst_address.as_u64[1], + .out_addr.as_u32 = 0, + .fib_index = fib_index, + .proto = proto, + }; + + nat64_db_st_walk (&nm->db, IP_PROTOCOL_TCP, unk_proto_st_walk, + &ctx); + + if (!ctx.out_addr.as_u32) + nat64_db_st_walk (&nm->db, IP_PROTOCOL_UDP, unk_proto_st_walk, + &ctx); + + /* Verify if out address is not already in use for protocol */ + memset (&addr, 0, sizeof (addr)); + addr.ip4.as_u32 = ctx.out_addr.as_u32; + if (nat64_db_bib_entry_find (&nm->db, &addr, 0, proto, 0, 0)) + ctx.out_addr.as_u32 = 0; + + if (!ctx.out_addr.as_u32) + { + for (i = 0; i < vec_len (nm->addr_pool); i++) + { + addr.ip4.as_u32 = nm->addr_pool[i].addr.as_u32; + if (!nat64_db_bib_entry_find + (&nm->db, &addr, 0, proto, 0, 0)) + break; + } + } + + if (!ctx.out_addr.as_u32) + return -1; + + bibe = + nat64_db_bib_entry_create (&nm->db, &ip6->src_address, + &ctx.out_addr, 0, 0, fib_index, proto, + 0); + if (!bibe) + return -1; + } + + nat64_extract_ip4 (&ip6->dst_address, &daddr.ip4, fib_index); + ste = + nat64_db_st_entry_create (&nm->db, bibe, &ip6->dst_address, + &daddr.ip4, 0); + if (!ste) + return -1; + } + + nat64_session_reset_timeout (ste, vm); + + nat64_compose_ip6 (&ip6->src_address, &bibe->out_addr, fib_index); + + memset (&saddr, 0, sizeof (saddr)); + memset (&daddr, 0, sizeof (daddr)); + saddr.ip4.as_u32 = bibe->out_addr.as_u32; + daddr.ip4.as_u32 = ste->out_r_addr.as_u32; + + ste = nat64_db_st_entry_find (&nm->db, &daddr, &saddr, 0, 0, proto, 0, 0); + + if (ste) + { + bibe = nat64_db_bib_entry_by_index (&nm->db, proto, ste->bibe_index); + if (!bibe) + return -1; + } + else + { + bibe = nat64_db_bib_entry_find (&nm->db, &daddr, 0, proto, 0, 0); + + if (!bibe) + return -1; + + ste = + nat64_db_st_entry_create (&nm->db, bibe, &ip6->src_address, + &saddr.ip4, 0); + } + + ip6->dst_address.as_u64[0] = bibe->in_addr.as_u64[0]; + ip6->dst_address.as_u64[1] = bibe->in_addr.as_u64[1]; + + return 0; +} + +static inline uword +nat64_in2out_node_fn_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame, u8 is_slow_path) +{ + u32 n_left_from, *from, *to_next; + nat64_in2out_next_t next_index; + u32 pkts_processed = 0; + u32 stats_node_index; + + stats_node_index = + is_slow_path ? 
nat64_in2out_slowpath_node.index : nat64_in2out_node.index; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + u32 next0; + ip6_header_t *ip60; + u16 l4_offset0, frag_offset0; + u8 l4_protocol0; + u32 proto0; + nat64_in2out_set_ctx_t ctx0; + + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + ip60 = vlib_buffer_get_current (b0); + + ctx0.b = b0; + ctx0.vm = vm; + + next0 = NAT64_IN2OUT_NEXT_IP4_LOOKUP; + + if (PREDICT_FALSE + (ip6_parse + (ip60, b0->current_length, &l4_protocol0, &l4_offset0, + &frag_offset0))) + { + next0 = NAT64_IN2OUT_NEXT_DROP; + b0->error = node->errors[NAT64_IN2OUT_ERROR_UNKNOWN]; + goto trace0; + } + + proto0 = ip_proto_to_snat_proto (l4_protocol0); + if (frag_offset0 != 0) + { + next0 = NAT64_IN2OUT_NEXT_DROP; + b0->error = + node->errors[NAT64_IN2OUT_ERROR_UNSUPPORTED_PROTOCOL]; + goto trace0; + } + + if (is_slow_path) + { + if (PREDICT_TRUE (proto0 == ~0)) + { + if (is_hairpinning (&ip60->dst_address)) + { + next0 = NAT64_IN2OUT_NEXT_IP6_LOOKUP; + if (nat64_in2out_unk_proto_hairpinning (vm, b0, ip60)) + { + next0 = NAT64_IN2OUT_NEXT_DROP; + b0->error = + node->errors[NAT64_IN2OUT_ERROR_NO_TRANSLATION]; + } + goto trace0; + } + + if (ip6_to_ip4 (b0, nat64_in2out_unk_proto_set_cb, &ctx0)) + { + next0 = NAT64_IN2OUT_NEXT_DROP; + b0->error = + node->errors[NAT64_IN2OUT_ERROR_NO_TRANSLATION]; + goto trace0; + } + } + goto trace0; + } + else + { + if (PREDICT_FALSE (proto0 == ~0)) + { + next0 = NAT64_IN2OUT_NEXT_SLOWPATH; + goto trace0; + } + } + + if (proto0 == SNAT_PROTOCOL_ICMP) + { + if (is_hairpinning (&ip60->dst_address)) + { + next0 = NAT64_IN2OUT_NEXT_IP6_LOOKUP; + if (nat64_in2out_icmp_hairpinning (vm, b0, ip60)) + { + next0 = NAT64_IN2OUT_NEXT_DROP; + b0->error = + node->errors[NAT64_IN2OUT_ERROR_NO_TRANSLATION]; + } + goto trace0; + } + + if (icmp6_to_icmp + (b0, nat64_in2out_icmp_set_cb, &ctx0, + nat64_in2out_inner_icmp_set_cb, &ctx0)) + { + next0 = NAT64_IN2OUT_NEXT_DROP; + b0->error = node->errors[NAT64_IN2OUT_ERROR_NO_TRANSLATION]; + goto trace0; + } + } + else if (proto0 == SNAT_PROTOCOL_TCP || proto0 == SNAT_PROTOCOL_UDP) + { + if (is_hairpinning (&ip60->dst_address)) + { + next0 = NAT64_IN2OUT_NEXT_IP6_LOOKUP; + if (nat64_in2out_tcp_udp_hairpinning (vm, b0, ip60)) + { + next0 = NAT64_IN2OUT_NEXT_DROP; + b0->error = + node->errors[NAT64_IN2OUT_ERROR_NO_TRANSLATION]; + } + goto trace0; + } + + if (ip6_to_ip4_tcp_udp + (b0, nat64_in2out_tcp_udp_set_cb, &ctx0, 0)) + { + next0 = NAT64_IN2OUT_NEXT_DROP; + b0->error = node->errors[NAT64_IN2OUT_ERROR_NO_TRANSLATION]; + goto trace0; + } + } + + trace0: + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + nat64_in2out_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_RX]; + t->next_index = next0; + t->is_slow_path = is_slow_path; + } + + pkts_processed += next0 != NAT64_IN2OUT_NEXT_DROP; + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + 
vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + vlib_node_increment_counter (vm, stats_node_index, + NAT64_IN2OUT_ERROR_IN2OUT_PACKETS, + pkts_processed); + return frame->n_vectors; +} + +static uword +nat64_in2out_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return nat64_in2out_node_fn_inline (vm, node, frame, 0); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (nat64_in2out_node) = { + .function = nat64_in2out_node_fn, + .name = "nat64-in2out", + .vector_size = sizeof (u32), + .format_trace = format_nat64_in2out_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN (nat64_in2out_error_strings), + .error_strings = nat64_in2out_error_strings, + .n_next_nodes = NAT64_IN2OUT_N_NEXT, + /* edit / add dispositions here */ + .next_nodes = { + [NAT64_IN2OUT_NEXT_DROP] = "error-drop", + [NAT64_IN2OUT_NEXT_IP4_LOOKUP] = "ip4-lookup", + [NAT64_IN2OUT_NEXT_IP6_LOOKUP] = "ip6-lookup", + [NAT64_IN2OUT_NEXT_SLOWPATH] = "nat64-in2out-slowpath", + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (nat64_in2out_node, nat64_in2out_node_fn); + +static uword +nat64_in2out_slowpath_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return nat64_in2out_node_fn_inline (vm, node, frame, 1); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (nat64_in2out_slowpath_node) = { + .function = nat64_in2out_slowpath_node_fn, + .name = "nat64-in2out-slowpath", + .vector_size = sizeof (u32), + .format_trace = format_nat64_in2out_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN (nat64_in2out_error_strings), + .error_strings = nat64_in2out_error_strings, + .n_next_nodes = NAT64_IN2OUT_N_NEXT, + /* edit / add dispositions here */ + .next_nodes = { + [NAT64_IN2OUT_NEXT_DROP] = "error-drop", + [NAT64_IN2OUT_NEXT_IP4_LOOKUP] = "ip4-lookup", + [NAT64_IN2OUT_NEXT_IP6_LOOKUP] = "ip6-lookup", + [NAT64_IN2OUT_NEXT_SLOWPATH] = "nat64-in2out-slowpath", + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (nat64_in2out_slowpath_node, + nat64_in2out_slowpath_node_fn); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/nat/nat64_out2in.c b/src/plugins/nat/nat64_out2in.c new file mode 100644 index 00000000..61e88a7f --- /dev/null +++ b/src/plugins/nat/nat64_out2in.c @@ -0,0 +1,494 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+/**
+ * @file
+ * @brief NAT64 IPv4 to IPv6 translation (outside to inside network)
+ */
+
+#include <nat/nat64.h>
+#include <vnet/ip/ip4_to_ip6.h>
+#include <vnet/fib/ip4_fib.h>
+
+typedef struct
+{
+  u32 sw_if_index;
+  u32 next_index;
+} nat64_out2in_trace_t;
+
+static u8 *
+format_nat64_out2in_trace (u8 * s, va_list * args)
+{
+  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+  nat64_out2in_trace_t *t = va_arg (*args, nat64_out2in_trace_t *);
+
+  s =
+    format (s, "NAT64-out2in: sw_if_index %d, next index %d", t->sw_if_index,
+	    t->next_index);
+
+  return s;
+}
+
+vlib_node_registration_t nat64_out2in_node;
+
+#define foreach_nat64_out2in_error \
+_(UNSUPPORTED_PROTOCOL, "Unsupported protocol") \
+_(OUT2IN_PACKETS, "Good out2in packets processed") \
+_(NO_TRANSLATION, "No translation") \
+_(UNKNOWN, "unknown")
+
+typedef enum
+{
+#define _(sym,str) NAT64_OUT2IN_ERROR_##sym,
+  foreach_nat64_out2in_error
+#undef _
+    NAT64_OUT2IN_N_ERROR,
+} nat64_out2in_error_t;
+
+static char *nat64_out2in_error_strings[] = {
+#define _(sym,string) string,
+  foreach_nat64_out2in_error
+#undef _
+};
+
+typedef enum
+{
+  NAT64_OUT2IN_NEXT_LOOKUP,
+  NAT64_OUT2IN_NEXT_DROP,
+  NAT64_OUT2IN_N_NEXT,
+} nat64_out2in_next_t;
+
+typedef struct nat64_out2in_set_ctx_t_
+{
+  vlib_buffer_t *b;
+  vlib_main_t *vm;
+} nat64_out2in_set_ctx_t;
+
+static int
+nat64_out2in_tcp_udp_set_cb (ip4_header_t * ip4, ip6_header_t * ip6,
+			     void *arg)
+{
+  nat64_main_t *nm = &nat64_main;
+  nat64_out2in_set_ctx_t *ctx = arg;
+  nat64_db_bib_entry_t *bibe;
+  nat64_db_st_entry_t *ste;
+  ip46_address_t saddr, daddr;
+  ip6_address_t ip6_saddr;
+  udp_header_t *udp = ip4_next_header (ip4);
+  tcp_header_t *tcp = ip4_next_header (ip4);
+  u8 proto = ip4->protocol;
+  u16 dport = udp->dst_port;
+  u16 sport = udp->src_port;
+  u32 sw_if_index, fib_index;
+  u16 *checksum;
+  ip_csum_t csum;
+
+  sw_if_index = vnet_buffer (ctx->b)->sw_if_index[VLIB_RX];
+  fib_index = ip4_fib_table_get_index_for_sw_if_index (sw_if_index);
+
+  memset (&saddr, 0, sizeof (saddr));
+  saddr.ip4.as_u32 = ip4->src_address.as_u32;
+  memset (&daddr, 0, sizeof (daddr));
+  daddr.ip4.as_u32 = ip4->dst_address.as_u32;
+
+  ste =
+    nat64_db_st_entry_find (&nm->db, &daddr, &saddr, dport, sport, proto,
+			    fib_index, 0);
+  if (ste)
+    {
+      bibe = nat64_db_bib_entry_by_index (&nm->db, proto, ste->bibe_index);
+      if (!bibe)
+	return -1;
+    }
+  else
+    {
+      bibe =
+	nat64_db_bib_entry_find (&nm->db, &daddr, dport, proto, fib_index, 0);
+
+      if (!bibe)
+	return -1;
+
+      nat64_compose_ip6 (&ip6_saddr, &ip4->src_address, bibe->fib_index);
+      ste =
+	nat64_db_st_entry_create (&nm->db, bibe, &ip6_saddr, &saddr.ip4,
+				  sport);
+    }
+
+  nat64_session_reset_timeout (ste, ctx->vm);
+
+  ip6->src_address.as_u64[0] = ste->in_r_addr.as_u64[0];
+  ip6->src_address.as_u64[1] = ste->in_r_addr.as_u64[1];
+
+  ip6->dst_address.as_u64[0] = bibe->in_addr.as_u64[0];
+  ip6->dst_address.as_u64[1] = bibe->in_addr.as_u64[1];
+  udp->dst_port = bibe->in_port;
+
+  if (proto == IP_PROTOCOL_UDP)
+    checksum = &udp->checksum;
+  else
+    checksum = &tcp->checksum;
+  csum = ip_csum_sub_even (*checksum, dport);
+  csum = ip_csum_add_even (csum, udp->dst_port);
+  *checksum = ip_csum_fold (csum);
+
+  vnet_buffer (ctx->b)->sw_if_index[VLIB_TX] = bibe->fib_index;
+
+  return 0;
+}
+
+static int
+nat64_out2in_icmp_set_cb (ip4_header_t * ip4, ip6_header_t * ip6, void *arg)
+{
+  nat64_main_t *nm = &nat64_main;
+  nat64_out2in_set_ctx_t *ctx = arg;
+
nat64_db_bib_entry_t *bibe; + nat64_db_st_entry_t *ste; + ip46_address_t saddr, daddr; + ip6_address_t ip6_saddr; + u32 sw_if_index, fib_index; + icmp46_header_t *icmp = ip4_next_header (ip4); + + sw_if_index = vnet_buffer (ctx->b)->sw_if_index[VLIB_RX]; + fib_index = ip4_fib_table_get_index_for_sw_if_index (sw_if_index); + + memset (&saddr, 0, sizeof (saddr)); + saddr.ip4.as_u32 = ip4->src_address.as_u32; + memset (&daddr, 0, sizeof (daddr)); + daddr.ip4.as_u32 = ip4->dst_address.as_u32; + + if (icmp->type == ICMP6_echo_request || icmp->type == ICMP6_echo_reply) + { + u16 out_id = ((u16 *) (icmp))[2]; + ste = + nat64_db_st_entry_find (&nm->db, &daddr, &saddr, out_id, 0, + IP_PROTOCOL_ICMP, fib_index, 0); + + if (ste) + { + bibe = + nat64_db_bib_entry_by_index (&nm->db, IP_PROTOCOL_ICMP, + ste->bibe_index); + if (!bibe) + return -1; + } + else + { + bibe = + nat64_db_bib_entry_find (&nm->db, &daddr, out_id, + IP_PROTOCOL_ICMP, fib_index, 0); + if (!bibe) + return -1; + + nat64_compose_ip6 (&ip6_saddr, &ip4->src_address, bibe->fib_index); + ste = + nat64_db_st_entry_create (&nm->db, bibe, &ip6_saddr, &saddr.ip4, + 0); + } + + nat64_session_reset_timeout (ste, ctx->vm); + + ip6->src_address.as_u64[0] = ste->in_r_addr.as_u64[0]; + ip6->src_address.as_u64[1] = ste->in_r_addr.as_u64[1]; + + ip6->dst_address.as_u64[0] = bibe->in_addr.as_u64[0]; + ip6->dst_address.as_u64[1] = bibe->in_addr.as_u64[1]; + ((u16 *) (icmp))[2] = bibe->in_port; + + vnet_buffer (ctx->b)->sw_if_index[VLIB_TX] = bibe->fib_index; + } + else + { + ip6_header_t *inner_ip6 = (ip6_header_t *) u8_ptr_add (icmp, 8); + + nat64_compose_ip6 (&ip6->src_address, &ip4->src_address, + vnet_buffer (ctx->b)->sw_if_index[VLIB_TX]); + ip6->dst_address.as_u64[0] = inner_ip6->src_address.as_u64[0]; + ip6->dst_address.as_u64[1] = inner_ip6->src_address.as_u64[1]; + } + + return 0; +} + +static int +nat64_out2in_inner_icmp_set_cb (ip4_header_t * ip4, ip6_header_t * ip6, + void *arg) +{ + nat64_main_t *nm = &nat64_main; + nat64_out2in_set_ctx_t *ctx = arg; + nat64_db_bib_entry_t *bibe; + nat64_db_st_entry_t *ste; + ip46_address_t saddr, daddr; + u32 sw_if_index, fib_index; + u8 proto = ip4->protocol; + + sw_if_index = vnet_buffer (ctx->b)->sw_if_index[VLIB_RX]; + fib_index = + fib_table_get_index_for_sw_if_index (FIB_PROTOCOL_IP6, sw_if_index); + + memset (&saddr, 0, sizeof (saddr)); + saddr.ip4.as_u32 = ip4->src_address.as_u32; + memset (&daddr, 0, sizeof (daddr)); + daddr.ip4.as_u32 = ip4->dst_address.as_u32; + + if (proto == IP_PROTOCOL_ICMP6) + { + icmp46_header_t *icmp = ip4_next_header (ip4); + u16 out_id = ((u16 *) (icmp))[2]; + proto = IP_PROTOCOL_ICMP; + + if (! 
+ (icmp->type == ICMP6_echo_request + || icmp->type == ICMP6_echo_reply)) + return -1; + + ste = + nat64_db_st_entry_find (&nm->db, &saddr, &daddr, out_id, 0, proto, + fib_index, 0); + if (!ste) + return -1; + + bibe = nat64_db_bib_entry_by_index (&nm->db, proto, ste->bibe_index); + if (!bibe) + return -1; + + ip6->dst_address.as_u64[0] = ste->in_r_addr.as_u64[0]; + ip6->dst_address.as_u64[1] = ste->in_r_addr.as_u64[1]; + ip6->src_address.as_u64[0] = bibe->in_addr.as_u64[0]; + ip6->src_address.as_u64[1] = bibe->in_addr.as_u64[1]; + ((u16 *) (icmp))[2] = bibe->in_port; + + vnet_buffer (ctx->b)->sw_if_index[VLIB_TX] = bibe->fib_index; + } + else + { + udp_header_t *udp = ip4_next_header (ip4); + tcp_header_t *tcp = ip4_next_header (ip4); + u16 dport = udp->dst_port; + u16 sport = udp->src_port; + u16 *checksum; + ip_csum_t csum; + + ste = + nat64_db_st_entry_find (&nm->db, &saddr, &daddr, sport, dport, proto, + fib_index, 0); + if (!ste) + return -1; + + bibe = nat64_db_bib_entry_by_index (&nm->db, proto, ste->bibe_index); + if (!bibe) + return -1; + + nat64_compose_ip6 (&ip6->dst_address, &daddr.ip4, bibe->fib_index); + ip6->src_address.as_u64[0] = bibe->in_addr.as_u64[0]; + ip6->src_address.as_u64[1] = bibe->in_addr.as_u64[1]; + udp->src_port = bibe->in_port; + + if (proto == IP_PROTOCOL_UDP) + checksum = &udp->checksum; + else + checksum = &tcp->checksum; + if (*checksum) + { + csum = ip_csum_sub_even (*checksum, sport); + csum = ip_csum_add_even (csum, udp->src_port); + *checksum = ip_csum_fold (csum); + } + + vnet_buffer (ctx->b)->sw_if_index[VLIB_TX] = bibe->fib_index; + } + + return 0; +} + +static int +nat64_out2in_unk_proto_set_cb (ip4_header_t * ip4, ip6_header_t * ip6, + void *arg) +{ + nat64_main_t *nm = &nat64_main; + nat64_out2in_set_ctx_t *ctx = arg; + nat64_db_bib_entry_t *bibe; + nat64_db_st_entry_t *ste; + ip46_address_t saddr, daddr; + ip6_address_t ip6_saddr; + u32 sw_if_index, fib_index; + u8 proto = ip4->protocol; + + sw_if_index = vnet_buffer (ctx->b)->sw_if_index[VLIB_RX]; + fib_index = ip4_fib_table_get_index_for_sw_if_index (sw_if_index); + + memset (&saddr, 0, sizeof (saddr)); + saddr.ip4.as_u32 = ip4->src_address.as_u32; + memset (&daddr, 0, sizeof (daddr)); + daddr.ip4.as_u32 = ip4->dst_address.as_u32; + + ste = + nat64_db_st_entry_find (&nm->db, &daddr, &saddr, 0, 0, proto, fib_index, + 0); + if (ste) + { + bibe = nat64_db_bib_entry_by_index (&nm->db, proto, ste->bibe_index); + if (!bibe) + return -1; + } + else + { + bibe = + nat64_db_bib_entry_find (&nm->db, &daddr, 0, proto, fib_index, 0); + + if (!bibe) + return -1; + + nat64_compose_ip6 (&ip6_saddr, &ip4->src_address, bibe->fib_index); + ste = + nat64_db_st_entry_create (&nm->db, bibe, &ip6_saddr, &saddr.ip4, 0); + } + + nat64_session_reset_timeout (ste, ctx->vm); + + ip6->src_address.as_u64[0] = ste->in_r_addr.as_u64[0]; + ip6->src_address.as_u64[1] = ste->in_r_addr.as_u64[1]; + + ip6->dst_address.as_u64[0] = bibe->in_addr.as_u64[0]; + ip6->dst_address.as_u64[1] = bibe->in_addr.as_u64[1]; + + vnet_buffer (ctx->b)->sw_if_index[VLIB_TX] = bibe->fib_index; + + return 0; +} + +static uword +nat64_out2in_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 n_left_from, *from, *to_next; + nat64_out2in_next_t next_index; + u32 pkts_processed = 0; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, 
n_left_to_next);
+
+      while (n_left_from > 0 && n_left_to_next > 0)
+	{
+	  u32 bi0;
+	  vlib_buffer_t *b0;
+	  u32 next0;
+	  ip4_header_t *ip40;
+	  u32 proto0;
+	  nat64_out2in_set_ctx_t ctx0;
+
+	  /* speculatively enqueue b0 to the current next frame */
+	  bi0 = from[0];
+	  to_next[0] = bi0;
+	  from += 1;
+	  to_next += 1;
+	  n_left_from -= 1;
+	  n_left_to_next -= 1;
+
+	  b0 = vlib_get_buffer (vm, bi0);
+	  ip40 = vlib_buffer_get_current (b0);
+
+	  ctx0.b = b0;
+	  ctx0.vm = vm;
+
+	  next0 = NAT64_OUT2IN_NEXT_LOOKUP;
+
+	  proto0 = ip_proto_to_snat_proto (ip40->protocol);
+
+	  if (proto0 == SNAT_PROTOCOL_ICMP)
+	    {
+	      if (icmp_to_icmp6
+		  (b0, nat64_out2in_icmp_set_cb, &ctx0,
+		   nat64_out2in_inner_icmp_set_cb, &ctx0))
+		{
+		  next0 = NAT64_OUT2IN_NEXT_DROP;
+		  b0->error = node->errors[NAT64_OUT2IN_ERROR_NO_TRANSLATION];
+		  goto trace0;
+		}
+	    }
+	  else if (proto0 == SNAT_PROTOCOL_TCP || proto0 == SNAT_PROTOCOL_UDP)
+	    {
+	      if (ip4_to_ip6_tcp_udp (b0, nat64_out2in_tcp_udp_set_cb, &ctx0))
+		{
+		  next0 = NAT64_OUT2IN_NEXT_DROP;
+		  b0->error = node->errors[NAT64_OUT2IN_ERROR_NO_TRANSLATION];
+		  goto trace0;
+		}
+	    }
+	  else
+	    {
+	      if (ip4_to_ip6 (b0, nat64_out2in_unk_proto_set_cb, &ctx0))
+		{
+		  next0 = NAT64_OUT2IN_NEXT_DROP;
+		  b0->error = node->errors[NAT64_OUT2IN_ERROR_NO_TRANSLATION];
+		  goto trace0;
+		}
+	    }
+
+	trace0:
+	  if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE)
+			     && (b0->flags & VLIB_BUFFER_IS_TRACED)))
+	    {
+	      nat64_out2in_trace_t *t =
+		vlib_add_trace (vm, node, b0, sizeof (*t));
+	      t->sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_RX];
+	      t->next_index = next0;
+	    }
+
+	  pkts_processed += next0 != NAT64_OUT2IN_NEXT_DROP;
+
+	  /* verify speculative enqueue, maybe switch current next frame */
+	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+					   n_left_to_next, bi0, next0);
+	}
+      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+    }
+  vlib_node_increment_counter (vm, nat64_out2in_node.index,
+			       NAT64_OUT2IN_ERROR_OUT2IN_PACKETS,
+			       pkts_processed);
+  return frame->n_vectors;
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (nat64_out2in_node) = {
+  .function = nat64_out2in_node_fn,
+  .name = "nat64-out2in",
+  .vector_size = sizeof (u32),
+  .format_trace = format_nat64_out2in_trace,
+  .type = VLIB_NODE_TYPE_INTERNAL,
+  .n_errors = ARRAY_LEN (nat64_out2in_error_strings),
+  .error_strings = nat64_out2in_error_strings,
+  .n_next_nodes = NAT64_OUT2IN_N_NEXT,
+  /* edit / add dispositions here */
+  .next_nodes = {
+    [NAT64_OUT2IN_NEXT_DROP] = "error-drop",
+    [NAT64_OUT2IN_NEXT_LOOKUP] = "ip6-lookup",
+  },
+};
+/* *INDENT-ON* */
+
+VLIB_NODE_FUNCTION_MULTIARCH (nat64_out2in_node, nat64_out2in_node_fn);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/nat/nat_all_api_h.h b/src/plugins/nat/nat_all_api_h.h
new file mode 100644
index 00000000..acd9ba1c
--- /dev/null
+++ b/src/plugins/nat/nat_all_api_h.h
@@ -0,0 +1,19 @@
+
+/*
+ * nat_all_api_h.h - skeleton vpp engine plug-in api #include file
+ *
+ * Copyright (c) <current-year> <your-organization>
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* Include the generated file, see BUILT_SOURCES in Makefile.am */ +#include <nat/nat.api.h> diff --git a/src/plugins/nat/nat_api.c b/src/plugins/nat/nat_api.c new file mode 100644 index 00000000..b56b4436 --- /dev/null +++ b/src/plugins/nat/nat_api.c @@ -0,0 +1,3396 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @file + * @brief NAT plugin API implementation + */ + +#include <nat/nat.h> +#include <nat/nat_det.h> +#include <nat/nat64.h> +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vlibsocket/api.h> +#include <nat/nat_msg_enum.h> +#include <vnet/fib/fib_table.h> + +#define vl_api_nat44_lb_addr_port_t_endian vl_noop_handler +#define vl_api_nat44_add_del_lb_static_mapping_t_endian vl_noop_handler +#define vl_api_nat44_nat44_lb_static_mapping_details_t_endian vl_noop_handler + +/* define message structures */ +#define vl_typedefs +#include <nat/nat_all_api_h.h> +#undef vl_typedefs + +/* define generated endian-swappers */ +#define vl_endianfun +#include <nat/nat_all_api_h.h> +#undef vl_endianfun + +#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__) + +#define REPLY_MSG_ID_BASE sm->msg_id_base +#include <vlibapi/api_helper_macros.h> + +/* Get the API version number */ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <nat/nat_all_api_h.h> +#undef vl_api_version + +/* Macro to finish up custom dump fns */ +#define FINISH \ + vec_add1 (s, 0); \ + vl_print (handle, (char *)s); \ + vec_free (s); \ + return handle; + +static void + vl_api_snat_add_address_range_t_handler + (vl_api_snat_add_address_range_t * mp) +{ + snat_main_t *sm = &snat_main; + vl_api_snat_add_address_range_reply_t *rmp; + ip4_address_t this_addr; + u32 start_host_order, end_host_order; + u32 vrf_id; + int i, count; + int rv = 0; + u32 *tmp; + + if (mp->is_ip4 != 1) + { + rv = VNET_API_ERROR_UNIMPLEMENTED; + goto send_reply; + } + + if (sm->static_mapping_only) + { + rv = VNET_API_ERROR_FEATURE_DISABLED; + goto send_reply; + } + + tmp = (u32 *) mp->first_ip_address; + start_host_order = clib_host_to_net_u32 (tmp[0]); + tmp = (u32 *) mp->last_ip_address; + end_host_order = clib_host_to_net_u32 (tmp[0]); + + count = (end_host_order - start_host_order) + 1; + + vrf_id = clib_host_to_net_u32 (mp->vrf_id); + + if (count > 1024) + clib_warning ("%U - %U, %d addresses...", + format_ip4_address, mp->first_ip_address, + format_ip4_address, mp->last_ip_address, count); + + memcpy (&this_addr.as_u8, mp->first_ip_address, 4); + + for (i = 0; i < count; i++) + { + if (mp->is_add) + snat_add_address (sm, &this_addr, vrf_id); + else + rv = snat_del_address (sm, this_addr, 0); + + if (rv) + goto send_reply; + + increment_v4_address (&this_addr); + } + +send_reply: + REPLY_MACRO (VL_API_SNAT_ADD_ADDRESS_RANGE_REPLY); +} + +static void *vl_api_snat_add_address_range_t_print + 
(vl_api_snat_add_address_range_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: snat_add_address_range "); + s = format (s, "%U ", format_ip4_address, mp->first_ip_address); + if (memcmp (mp->first_ip_address, mp->last_ip_address, 4)) + { + s = format (s, " - %U ", format_ip4_address, mp->last_ip_address); + } + FINISH; +} + +static void + send_snat_address_details + (snat_address_t * a, unix_shared_memory_queue_t * q, u32 context) +{ + vl_api_snat_address_details_t *rmp; + snat_main_t *sm = &snat_main; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_SNAT_ADDRESS_DETAILS + sm->msg_id_base); + rmp->is_ip4 = 1; + clib_memcpy (rmp->ip_address, &(a->addr), 4); + if (a->fib_index != ~0) + { + fib_table_t *fib = fib_table_get (a->fib_index, FIB_PROTOCOL_IP4); + rmp->vrf_id = ntohl (fib->ft_table_id); + } + else + rmp->vrf_id = ~0; + rmp->context = context; + + vl_msg_api_send_shmem (q, (u8 *) & rmp); +} + +static void +vl_api_snat_address_dump_t_handler (vl_api_snat_address_dump_t * mp) +{ + unix_shared_memory_queue_t *q; + snat_main_t *sm = &snat_main; + snat_address_t *a; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + return; + + /* *INDENT-OFF* */ + vec_foreach (a, sm->addresses) + send_snat_address_details (a, q, mp->context); + /* *INDENT-ON* */ +} + +static void *vl_api_snat_address_dump_t_print + (vl_api_snat_address_dump_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: snat_address_dump "); + + FINISH; +} + +static void + vl_api_snat_interface_add_del_feature_t_handler + (vl_api_snat_interface_add_del_feature_t * mp) +{ + snat_main_t *sm = &snat_main; + vl_api_snat_interface_add_del_feature_reply_t *rmp; + u8 is_del = mp->is_add == 0; + u32 sw_if_index = ntohl (mp->sw_if_index); + int rv = 0; + + VALIDATE_SW_IF_INDEX (mp); + + rv = snat_interface_add_del (sw_if_index, mp->is_inside, is_del); + + BAD_SW_IF_INDEX_LABEL; + + REPLY_MACRO (VL_API_SNAT_INTERFACE_ADD_DEL_FEATURE_REPLY); +} + +static void *vl_api_snat_interface_add_del_feature_t_print + (vl_api_snat_interface_add_del_feature_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: snat_interface_add_del_feature "); + s = format (s, "sw_if_index %d %s %s", + clib_host_to_net_u32 (mp->sw_if_index), + mp->is_inside ? "in" : "out", mp->is_add ? 
"" : "del"); + + FINISH; +} + +static void + send_snat_interface_details + (snat_interface_t * i, unix_shared_memory_queue_t * q, u32 context) +{ + vl_api_snat_interface_details_t *rmp; + snat_main_t *sm = &snat_main; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_SNAT_INTERFACE_DETAILS + sm->msg_id_base); + rmp->sw_if_index = ntohl (i->sw_if_index); + rmp->is_inside = i->is_inside; + rmp->context = context; + + vl_msg_api_send_shmem (q, (u8 *) & rmp); +} + +static void +vl_api_snat_interface_dump_t_handler (vl_api_snat_interface_dump_t * mp) +{ + unix_shared_memory_queue_t *q; + snat_main_t *sm = &snat_main; + snat_interface_t *i; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + return; + + /* *INDENT-OFF* */ + pool_foreach (i, sm->interfaces, + ({ + send_snat_interface_details(i, q, mp->context); + })); + /* *INDENT-ON* */ +} + +static void *vl_api_snat_interface_dump_t_print + (vl_api_snat_interface_dump_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: snat_interface_dump "); + + FINISH; +} + +static void + vl_api_snat_interface_add_del_output_feature_t_handler + (vl_api_snat_interface_add_del_output_feature_t * mp) +{ + snat_main_t *sm = &snat_main; + vl_api_snat_interface_add_del_output_feature_reply_t *rmp; + u8 is_del = mp->is_add == 0; + u32 sw_if_index = ntohl (mp->sw_if_index); + int rv = 0; + + VALIDATE_SW_IF_INDEX (mp); + + rv = snat_interface_add_del_output_feature (sw_if_index, mp->is_inside, + is_del); + + BAD_SW_IF_INDEX_LABEL; + + REPLY_MACRO (VL_API_SNAT_INTERFACE_ADD_DEL_OUTPUT_FEATURE_REPLY); +} + +static void *vl_api_snat_interface_add_del_output_feature_t_print + (vl_api_snat_interface_add_del_output_feature_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: snat_interface_add_del_output_feature "); + s = format (s, "sw_if_index %d %s %s", + clib_host_to_net_u32 (mp->sw_if_index), + mp->is_inside ? "in" : "out", mp->is_add ? 
"" : "del"); + + FINISH; +} + +static void +send_snat_interface_output_feature_details (snat_interface_t * i, + unix_shared_memory_queue_t * q, + u32 context) +{ + vl_api_snat_interface_output_feature_details_t *rmp; + snat_main_t *sm = &snat_main; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = + ntohs (VL_API_SNAT_INTERFACE_OUTPUT_FEATURE_DETAILS + sm->msg_id_base); + rmp->sw_if_index = ntohl (i->sw_if_index); + rmp->context = context; + rmp->is_inside = i->is_inside; + + vl_msg_api_send_shmem (q, (u8 *) & rmp); +} + +static void + vl_api_snat_interface_output_feature_dump_t_handler + (vl_api_snat_interface_output_feature_dump_t * mp) +{ + unix_shared_memory_queue_t *q; + snat_main_t *sm = &snat_main; + snat_interface_t *i; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + return; + + /* *INDENT-OFF* */ + pool_foreach (i, sm->output_feature_interfaces, + ({ + send_snat_interface_output_feature_details(i, q, mp->context); + })); + /* *INDENT-ON* */ +} + +static void *vl_api_snat_interface_output_feature_dump_t_print + (vl_api_snat_interface_output_feature_dump_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: snat_interface_output_feature_dump "); + + FINISH; +} + +static void + vl_api_snat_add_static_mapping_t_handler + (vl_api_snat_add_static_mapping_t * mp) +{ + snat_main_t *sm = &snat_main; + vl_api_snat_add_static_mapping_reply_t *rmp; + ip4_address_t local_addr, external_addr; + u16 local_port = 0, external_port = 0; + u32 vrf_id, external_sw_if_index; + int rv = 0; + snat_protocol_t proto; + + if (mp->is_ip4 != 1) + { + rv = VNET_API_ERROR_UNIMPLEMENTED; + goto send_reply; + } + + memcpy (&local_addr.as_u8, mp->local_ip_address, 4); + memcpy (&external_addr.as_u8, mp->external_ip_address, 4); + if (mp->addr_only == 0) + { + local_port = clib_net_to_host_u16 (mp->local_port); + external_port = clib_net_to_host_u16 (mp->external_port); + } + vrf_id = clib_net_to_host_u32 (mp->vrf_id); + external_sw_if_index = clib_net_to_host_u32 (mp->external_sw_if_index); + proto = ip_proto_to_snat_proto (mp->protocol); + + rv = snat_add_static_mapping (local_addr, external_addr, local_port, + external_port, vrf_id, mp->addr_only, + external_sw_if_index, proto, mp->is_add); + +send_reply: + REPLY_MACRO (VL_API_SNAT_ADD_ADDRESS_RANGE_REPLY); +} + +static void *vl_api_snat_add_static_mapping_t_print + (vl_api_snat_add_static_mapping_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: snat_add_static_mapping "); + s = format (s, "protocol %d local_addr %U external_addr %U ", + mp->protocol, + format_ip4_address, mp->local_ip_address, + format_ip4_address, mp->external_ip_address); + + if (mp->addr_only == 0) + s = format (s, "local_port %d external_port %d ", + clib_net_to_host_u16 (mp->local_port), + clib_net_to_host_u16 (mp->external_port)); + + if (mp->vrf_id != ~0) + s = format (s, "vrf %d", clib_net_to_host_u32 (mp->vrf_id)); + + if (mp->external_sw_if_index != ~0) + s = format (s, "external_sw_if_index %d", + clib_net_to_host_u32 (mp->external_sw_if_index)); + FINISH; +} + +static void + send_snat_static_mapping_details + (snat_static_mapping_t * m, unix_shared_memory_queue_t * q, u32 context) +{ + vl_api_snat_static_mapping_details_t *rmp; + snat_main_t *sm = &snat_main; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = + ntohs (VL_API_SNAT_STATIC_MAPPING_DETAILS + sm->msg_id_base); + rmp->is_ip4 = 1; + rmp->addr_only = m->addr_only; + clib_memcpy 
(rmp->local_ip_address, &(m->local_addr), 4); + clib_memcpy (rmp->external_ip_address, &(m->external_addr), 4); + rmp->local_port = htons (m->local_port); + rmp->external_port = htons (m->external_port); + rmp->external_sw_if_index = ~0; + rmp->vrf_id = htonl (m->vrf_id); + rmp->protocol = snat_proto_to_ip_proto (m->proto); + rmp->context = context; + + vl_msg_api_send_shmem (q, (u8 *) & rmp); +} + +static void + send_snat_static_map_resolve_details + (snat_static_map_resolve_t * m, unix_shared_memory_queue_t * q, u32 context) +{ + vl_api_snat_static_mapping_details_t *rmp; + snat_main_t *sm = &snat_main; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = + ntohs (VL_API_SNAT_STATIC_MAPPING_DETAILS + sm->msg_id_base); + rmp->is_ip4 = 1; + rmp->addr_only = m->addr_only; + clib_memcpy (rmp->local_ip_address, &(m->l_addr), 4); + rmp->local_port = htons (m->l_port); + rmp->external_port = htons (m->e_port); + rmp->external_sw_if_index = htonl (m->sw_if_index); + rmp->vrf_id = htonl (m->vrf_id); + rmp->protocol = snat_proto_to_ip_proto (m->proto); + rmp->context = context; + + vl_msg_api_send_shmem (q, (u8 *) & rmp); +} + +static void + vl_api_snat_static_mapping_dump_t_handler + (vl_api_snat_static_mapping_dump_t * mp) +{ + unix_shared_memory_queue_t *q; + snat_main_t *sm = &snat_main; + snat_static_mapping_t *m; + snat_static_map_resolve_t *rp; + int j; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + return; + + /* *INDENT-OFF* */ + pool_foreach (m, sm->static_mappings, + ({ + if (!vec_len(m->locals)) + send_snat_static_mapping_details (m, q, mp->context); + })); + /* *INDENT-ON* */ + + for (j = 0; j < vec_len (sm->to_resolve); j++) + { + rp = sm->to_resolve + j; + send_snat_static_map_resolve_details (rp, q, mp->context); + } +} + +static void *vl_api_snat_static_mapping_dump_t_print + (vl_api_snat_static_mapping_dump_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: snat_static_mapping_dump "); + + FINISH; +} + +static void +vl_api_snat_control_ping_t_handler (vl_api_snat_control_ping_t * mp) +{ + vl_api_snat_control_ping_reply_t *rmp; + snat_main_t *sm = &snat_main; + int rv = 0; + + /* *INDENT-OFF* */ + REPLY_MACRO2 (VL_API_SNAT_CONTROL_PING_REPLY, + ({ + rmp->vpe_pid = ntohl (getpid ()); + })); + /* *INDENT-ON* */ +} + +static void *vl_api_snat_control_ping_t_print + (vl_api_snat_control_ping_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: snat_control_ping "); + + FINISH; +} + +static void +vl_api_snat_show_config_t_handler (vl_api_snat_show_config_t * mp) +{ + vl_api_snat_show_config_reply_t *rmp; + snat_main_t *sm = &snat_main; + int rv = 0; + + /* *INDENT-OFF* */ + REPLY_MACRO2 (VL_API_SNAT_SHOW_CONFIG_REPLY, + ({ + rmp->translation_buckets = htonl (sm->translation_buckets); + rmp->translation_memory_size = htonl (sm->translation_memory_size); + rmp->user_buckets = htonl (sm->user_buckets); + rmp->user_memory_size = htonl (sm->user_memory_size); + rmp->max_translations_per_user = htonl (sm->max_translations_per_user); + rmp->outside_vrf_id = htonl (sm->outside_vrf_id); + rmp->inside_vrf_id = htonl (sm->inside_vrf_id); + rmp->static_mapping_only = sm->static_mapping_only; + rmp->static_mapping_connection_tracking = + sm->static_mapping_connection_tracking; + rmp->deterministic = sm->deterministic; + })); + /* *INDENT-ON* */ +} + +static void *vl_api_snat_show_config_t_print + (vl_api_snat_show_config_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: snat_show_config "); 
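+  /* FINISH (defined with the other helper macros near the top of this
+   * file) NUL-terminates the format vector s, prints it via vl_print,
+   * frees it and returns handle; every *_t_print custom-dump handler
+   * in this file ends this way. */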
+ + FINISH; +} + +static void +vl_api_snat_set_workers_t_handler (vl_api_snat_set_workers_t * mp) +{ + snat_main_t *sm = &snat_main; + vl_api_snat_set_workers_reply_t *rmp; + int rv = 0; + uword *bitmap = 0; + u64 mask = clib_net_to_host_u64 (mp->worker_mask); + + if (sm->num_workers < 2) + { + rv = VNET_API_ERROR_FEATURE_DISABLED; + goto send_reply; + } + + bitmap = clib_bitmap_set_multiple (bitmap, 0, mask, BITS (mask)); + rv = snat_set_workers (bitmap); + clib_bitmap_free (bitmap); + +send_reply: + REPLY_MACRO (VL_API_SNAT_SET_WORKERS_REPLY); +} + +static void *vl_api_snat_set_workers_t_print + (vl_api_snat_set_workers_t * mp, void *handle) +{ + u8 *s; + uword *bitmap = 0; + u8 first = 1; + int i; + u64 mask = clib_net_to_host_u64 (mp->worker_mask); + + s = format (0, "SCRIPT: snat_set_workers "); + bitmap = clib_bitmap_set_multiple (bitmap, 0, mask, BITS (mask)); + /* *INDENT-OFF* */ + clib_bitmap_foreach (i, bitmap, + ({ + if (first) + s = format (s, "%d", i); + else + s = format (s, ",%d", i); + first = 0; + })); + /* *INDENT-ON* */ + clib_bitmap_free (bitmap); + FINISH; +} + +static void + send_snat_worker_details + (u32 worker_index, unix_shared_memory_queue_t * q, u32 context) +{ + vl_api_snat_worker_details_t *rmp; + snat_main_t *sm = &snat_main; + vlib_worker_thread_t *w = + vlib_worker_threads + worker_index + sm->first_worker_index; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_SNAT_WORKER_DETAILS + sm->msg_id_base); + rmp->context = context; + rmp->worker_index = htonl (worker_index); + rmp->lcore_id = htonl (w->lcore_id); + strncpy ((char *) rmp->name, (char *) w->name, ARRAY_LEN (rmp->name) - 1); + + vl_msg_api_send_shmem (q, (u8 *) & rmp); +} + +static void +vl_api_snat_worker_dump_t_handler (vl_api_snat_worker_dump_t * mp) +{ + unix_shared_memory_queue_t *q; + snat_main_t *sm = &snat_main; + u32 *worker_index; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + return; + + /* *INDENT-OFF* */ + vec_foreach (worker_index, sm->workers) + send_snat_worker_details(*worker_index, q, mp->context); + /* *INDENT-ON* */ +} + +static void *vl_api_snat_worker_dump_t_print + (vl_api_snat_worker_dump_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: snat_worker_dump "); + + FINISH; +} + +static void + vl_api_snat_add_del_interface_addr_t_handler + (vl_api_snat_add_del_interface_addr_t * mp) +{ + snat_main_t *sm = &snat_main; + vl_api_snat_add_del_interface_addr_reply_t *rmp; + u8 is_del = mp->is_add == 0; + u32 sw_if_index = ntohl (mp->sw_if_index); + int rv = 0; + + VALIDATE_SW_IF_INDEX (mp); + + rv = snat_add_interface_address (sm, sw_if_index, is_del); + + BAD_SW_IF_INDEX_LABEL; + + REPLY_MACRO (VL_API_SNAT_ADD_DEL_INTERFACE_ADDR_REPLY); +} + +static void *vl_api_snat_add_del_interface_addr_t_print + (vl_api_snat_add_del_interface_addr_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: snat_add_del_interface_addr "); + s = format (s, "sw_if_index %d %s", + clib_host_to_net_u32 (mp->sw_if_index), + mp->is_add ? 
"" : "del"); + + FINISH; +} + +static void + send_snat_interface_addr_details + (u32 sw_if_index, unix_shared_memory_queue_t * q, u32 context) +{ + vl_api_snat_interface_addr_details_t *rmp; + snat_main_t *sm = &snat_main; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = + ntohs (VL_API_SNAT_INTERFACE_ADDR_DETAILS + sm->msg_id_base); + rmp->sw_if_index = ntohl (sw_if_index); + rmp->context = context; + + vl_msg_api_send_shmem (q, (u8 *) & rmp); +} + +static void + vl_api_snat_interface_addr_dump_t_handler + (vl_api_snat_interface_addr_dump_t * mp) +{ + unix_shared_memory_queue_t *q; + snat_main_t *sm = &snat_main; + u32 *i; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + return; + + /* *INDENT-OFF* */ + vec_foreach (i, sm->auto_add_sw_if_indices) + send_snat_interface_addr_details(*i, q, mp->context); + /* *INDENT-ON* */ +} + +static void *vl_api_snat_interface_addr_dump_t_print + (vl_api_snat_interface_addr_dump_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: snat_interface_addr_dump "); + + FINISH; +} + +static void + vl_api_snat_ipfix_enable_disable_t_handler + (vl_api_snat_ipfix_enable_disable_t * mp) +{ + snat_main_t *sm = &snat_main; + vl_api_snat_ipfix_enable_disable_reply_t *rmp; + int rv = 0; + + rv = snat_ipfix_logging_enable_disable (mp->enable, + clib_host_to_net_u32 + (mp->domain_id), + clib_host_to_net_u16 + (mp->src_port)); + + REPLY_MACRO (VL_API_SNAT_IPFIX_ENABLE_DISABLE_REPLY); +} + +static void *vl_api_snat_ipfix_enable_disable_t_print + (vl_api_snat_ipfix_enable_disable_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: snat_ipfix_enable_disable "); + if (mp->domain_id) + s = format (s, "domain %d ", clib_net_to_host_u32 (mp->domain_id)); + if (mp->src_port) + s = format (s, "src_port %d ", clib_net_to_host_u16 (mp->src_port)); + if (!mp->enable) + s = format (s, "disable "); + + FINISH; +} + +static void + send_snat_user_details + (snat_user_t * u, unix_shared_memory_queue_t * q, u32 context) +{ + vl_api_snat_user_details_t *rmp; + snat_main_t *sm = &snat_main; + fib_table_t *fib = fib_table_get (u->fib_index, FIB_PROTOCOL_IP4); + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_SNAT_USER_DETAILS + sm->msg_id_base); + + rmp->vrf_id = ntohl (fib->ft_table_id); + + rmp->is_ip4 = 1; + clib_memcpy (rmp->ip_address, &(u->addr), 4); + rmp->nsessions = ntohl (u->nsessions); + rmp->nstaticsessions = ntohl (u->nstaticsessions); + rmp->context = context; + + vl_msg_api_send_shmem (q, (u8 *) & rmp); +} + +static void +vl_api_snat_user_dump_t_handler (vl_api_snat_user_dump_t * mp) +{ + unix_shared_memory_queue_t *q; + snat_main_t *sm = &snat_main; + snat_main_per_thread_data_t *tsm; + snat_user_t *u; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + return; + + /* *INDENT-OFF* */ + vec_foreach (tsm, sm->per_thread_data) + vec_foreach (u, tsm->users) + send_snat_user_details (u, q, mp->context); + /* *INDENT-ON* */ +} + +static void *vl_api_snat_user_dump_t_print + (vl_api_snat_user_dump_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: snat_user_dump "); + + FINISH; +} + +static void + send_snat_user_session_details + (snat_session_t * s, unix_shared_memory_queue_t * q, u32 context) +{ + vl_api_snat_user_session_details_t *rmp; + snat_main_t *sm = &snat_main; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = + ntohs 
(VL_API_SNAT_USER_SESSION_DETAILS + sm->msg_id_base);
+  rmp->is_ip4 = 1;
+  clib_memcpy (rmp->outside_ip_address, (&s->out2in.addr), 4);
+  clib_memcpy (rmp->inside_ip_address, (&s->in2out.addr), 4);
+  rmp->is_static = s->flags & SNAT_SESSION_FLAG_STATIC_MAPPING ? 1 : 0;
+  rmp->last_heard = clib_host_to_net_u64 ((u64) s->last_heard);
+  rmp->total_bytes = clib_host_to_net_u64 (s->total_bytes);
+  rmp->total_pkts = ntohl (s->total_pkts);
+  rmp->context = context;
+  if (snat_is_unk_proto_session (s))
+    {
+      rmp->outside_port = 0;
+      rmp->inside_port = 0;
+      rmp->protocol = ntohs (s->in2out.port);
+    }
+  else
+    {
+      rmp->outside_port = s->out2in.port;
+      rmp->inside_port = s->in2out.port;
+      rmp->protocol = ntohs (snat_proto_to_ip_proto (s->in2out.protocol));
+    }
+
+  vl_msg_api_send_shmem (q, (u8 *) & rmp);
+}
+
+static void
+  vl_api_snat_user_session_dump_t_handler
+  (vl_api_snat_user_session_dump_t * mp)
+{
+  unix_shared_memory_queue_t *q;
+  snat_main_t *sm = &snat_main;
+  snat_main_per_thread_data_t *tsm;
+  snat_session_t *s;
+  clib_bihash_kv_8_8_t key, value;
+  snat_user_key_t ukey;
+  snat_user_t *u;
+  u32 session_index, head_index, elt_index;
+  dlist_elt_t *head, *elt;
+  ip4_header_t ip;
+
+  q = vl_api_client_index_to_input_queue (mp->client_index);
+  if (q == 0)
+    return;
+  if (!mp->is_ip4)
+    return;
+
+  clib_memcpy (&ukey.addr, mp->ip_address, 4);
+  ip.src_address.as_u32 = ukey.addr.as_u32;
+  ukey.fib_index = fib_table_find (FIB_PROTOCOL_IP4, ntohl (mp->vrf_id));
+  key.key = ukey.as_u64;
+  if (sm->num_workers)
+    tsm =
+      vec_elt_at_index (sm->per_thread_data,
+                        sm->worker_in2out_cb (&ip, ukey.fib_index));
+  else
+    tsm = vec_elt_at_index (sm->per_thread_data, sm->num_workers);
+  if (clib_bihash_search_8_8 (&tsm->user_hash, &key, &value))
+    return;
+  u = pool_elt_at_index (tsm->users, value.value);
+  if (!u->nsessions && !u->nstaticsessions)
+    return;
+
+  head_index = u->sessions_per_user_list_head_index;
+  head = pool_elt_at_index (tsm->list_pool, head_index);
+  elt_index = head->next;
+  elt = pool_elt_at_index (tsm->list_pool, elt_index);
+  session_index = elt->value;
+  while (session_index != ~0)
+    {
+      s = pool_elt_at_index (tsm->sessions, session_index);
+
+      send_snat_user_session_details (s, q, mp->context);
+
+      elt_index = elt->next;
+      elt = pool_elt_at_index (tsm->list_pool, elt_index);
+      session_index = elt->value;
+    }
+}
+
+static void *vl_api_snat_user_session_dump_t_print
+  (vl_api_snat_user_session_dump_t * mp, void *handle)
+{
+  u8 *s;
+
+  s = format (0, "SCRIPT: snat_user_session_dump ");
+  s = format (s, "ip_address %U vrf_id %d\n",
+              format_ip4_address, mp->ip_address,
+              clib_net_to_host_u32 (mp->vrf_id));
+
+  FINISH;
+}
+
+/*******************************************************************/
+/*** deterministic NAT/CGN (old, will be deprecated after 17.10) ***/
+/*******************************************************************/
+
+static void
+vl_api_snat_add_det_map_t_handler (vl_api_snat_add_det_map_t * mp)
+{
+  snat_main_t *sm = &snat_main;
+  vl_api_snat_add_det_map_reply_t *rmp;
+  int rv = 0;
+  ip4_address_t in_addr, out_addr;
+
+  clib_memcpy (&in_addr, mp->in_addr, 4);
+  clib_memcpy (&out_addr, mp->out_addr, 4);
+  rv = snat_det_add_map (sm, &in_addr, mp->in_plen, &out_addr,
+                         mp->out_plen, mp->is_add);
+
+  REPLY_MACRO (VL_API_SNAT_ADD_DET_MAP_REPLY);
+}
+
+static void *vl_api_snat_add_det_map_t_print
+  (vl_api_snat_add_det_map_t * mp, void *handle)
+{
+  u8 *s;
+
+  s = format (0, "SCRIPT: snat_add_det_map ");
+  s = format (s, "inside address %U/%d outside address %U/%d\n",
+
format_ip4_address, mp->in_addr, mp->in_plen,
+              format_ip4_address, mp->out_addr, mp->out_plen);
+
+  FINISH;
+}
+
+static void
+vl_api_snat_det_forward_t_handler (vl_api_snat_det_forward_t * mp)
+{
+  snat_main_t *sm = &snat_main;
+  vl_api_snat_det_forward_reply_t *rmp;
+  int rv = 0;
+  u16 lo_port = 0, hi_port = 0;
+  snat_det_map_t *dm;
+  ip4_address_t in_addr, out_addr;
+
+  out_addr.as_u32 = 0;
+  clib_memcpy (&in_addr, mp->in_addr, 4);
+  dm = snat_det_map_by_user (sm, &in_addr);
+  if (!dm)
+    {
+      rv = VNET_API_ERROR_NO_SUCH_ENTRY;
+      goto send_reply;
+    }
+
+  snat_det_forward (dm, &in_addr, &out_addr, &lo_port);
+  hi_port = lo_port + dm->ports_per_host - 1;
+
+send_reply:
+  /* *INDENT-OFF* */
+  REPLY_MACRO2 (VL_API_SNAT_DET_FORWARD_REPLY,
+  ({
+    rmp->out_port_lo = ntohs (lo_port);
+    rmp->out_port_hi = ntohs (hi_port);
+    rmp->is_ip4 = 1;
+    memset (rmp->out_addr, 0, 16);
+    clib_memcpy (rmp->out_addr, &out_addr, 4);
+  }))
+  /* *INDENT-ON* */
+}
+
+static void *vl_api_snat_det_forward_t_print
+  (vl_api_snat_det_forward_t * mp, void *handle)
+{
+  u8 *s;
+
+  s = format (0, "SCRIPT: snat_det_forward ");
+  s = format (s, "inside ip address %U\n", format_ip4_address, mp->in_addr);
+
+  FINISH;
+}
+
+static void
+vl_api_snat_det_reverse_t_handler (vl_api_snat_det_reverse_t * mp)
+{
+  snat_main_t *sm = &snat_main;
+  vl_api_snat_det_reverse_reply_t *rmp;
+  int rv = 0;
+  ip4_address_t out_addr, in_addr;
+  snat_det_map_t *dm;
+
+  in_addr.as_u32 = 0;
+  clib_memcpy (&out_addr, mp->out_addr, 4);
+  dm = snat_det_map_by_out (sm, &out_addr);
+  if (!dm)
+    {
+      rv = VNET_API_ERROR_NO_SUCH_ENTRY;
+      goto send_reply;
+    }
+
+  snat_det_reverse (dm, &out_addr, htons (mp->out_port), &in_addr);
+
+send_reply:
+  /* *INDENT-OFF* */
+  REPLY_MACRO2 (VL_API_SNAT_DET_REVERSE_REPLY,
+  ({
+    rmp->is_ip4 = 1;
+    memset (rmp->in_addr, 0, 16);
+    clib_memcpy (rmp->in_addr, &in_addr, 4);
+  }))
+  /* *INDENT-ON* */
+}
+
+static void *vl_api_snat_det_reverse_t_print
+  (vl_api_snat_det_reverse_t * mp, void *handle)
+{
+  u8 *s;
+
+  s = format (0, "SCRIPT: snat_det_reverse ");
+  s = format (s, "outside ip address %U outside port %d",
+              format_ip4_address, mp->out_addr, ntohs (mp->out_port));
+
+  FINISH;
+}
+
+static void
+  send_snat_det_map_details
+  (snat_det_map_t * m, unix_shared_memory_queue_t * q, u32 context)
+{
+  vl_api_snat_det_map_details_t *rmp;
+  snat_main_t *sm = &snat_main;
+
+  rmp = vl_msg_api_alloc (sizeof (*rmp));
+  memset (rmp, 0, sizeof (*rmp));
+  rmp->_vl_msg_id = ntohs (VL_API_SNAT_DET_MAP_DETAILS + sm->msg_id_base);
+  rmp->is_ip4 = 1;
+  clib_memcpy (rmp->in_addr, &m->in_addr, 4);
+  rmp->in_plen = m->in_plen;
+  clib_memcpy (rmp->out_addr, &m->out_addr, 4);
+  rmp->out_plen = m->out_plen;
+  rmp->sharing_ratio = htonl (m->sharing_ratio);
+  rmp->ports_per_host = htons (m->ports_per_host);
+  rmp->ses_num = htonl (m->ses_num);
+  rmp->context = context;
+
+  vl_msg_api_send_shmem (q, (u8 *) & rmp);
+}
+
+static void
+vl_api_snat_det_map_dump_t_handler (vl_api_snat_det_map_dump_t * mp)
+{
+  unix_shared_memory_queue_t *q;
+  snat_main_t *sm = &snat_main;
+  snat_det_map_t *m;
+
+  q = vl_api_client_index_to_input_queue (mp->client_index);
+  if (q == 0)
+    return;
+
+  /* *INDENT-OFF* */
+  vec_foreach(m, sm->det_maps)
+    send_snat_det_map_details(m, q, mp->context);
+  /* *INDENT-ON* */
+}
+
+static void *vl_api_snat_det_map_dump_t_print
+  (vl_api_snat_det_map_dump_t * mp, void *handle)
+{
+  u8 *s;
+
+  s = format (0, "SCRIPT: snat_det_map_dump ");
+
+  FINISH;
+}
+
+static void
+vl_api_snat_det_set_timeouts_t_handler
(vl_api_snat_det_set_timeouts_t * mp) +{ + snat_main_t *sm = &snat_main; + vl_api_snat_det_set_timeouts_reply_t *rmp; + int rv = 0; + + sm->udp_timeout = ntohl (mp->udp); + sm->tcp_established_timeout = ntohl (mp->tcp_established); + sm->tcp_transitory_timeout = ntohl (mp->tcp_transitory); + sm->icmp_timeout = ntohl (mp->icmp); + + REPLY_MACRO (VL_API_SNAT_DET_SET_TIMEOUTS_REPLY); +} + +static void *vl_api_snat_det_set_timeouts_t_print + (vl_api_snat_det_set_timeouts_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: snat_det_set_timeouts "); + s = format (s, "udp %d tcp_established %d tcp_transitory %d icmp %d\n", + ntohl (mp->udp), + ntohl (mp->tcp_established), + ntohl (mp->tcp_transitory), ntohl (mp->icmp)); + + FINISH; +} + +static void +vl_api_snat_det_get_timeouts_t_handler (vl_api_snat_det_get_timeouts_t * mp) +{ + snat_main_t *sm = &snat_main; + vl_api_snat_det_get_timeouts_reply_t *rmp; + int rv = 0; + + /* *INDENT-OFF* */ + REPLY_MACRO2 (VL_API_SNAT_DET_GET_TIMEOUTS_REPLY, + ({ + rmp->udp = htonl (sm->udp_timeout); + rmp->tcp_established = htonl (sm->tcp_established_timeout); + rmp->tcp_transitory = htonl (sm->tcp_transitory_timeout); + rmp->icmp = htonl (sm->icmp_timeout); + })) + /* *INDENT-ON* */ +} + +static void *vl_api_snat_det_get_timeouts_t_print + (vl_api_snat_det_get_timeouts_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: snat_det_get_timeouts"); + + FINISH; +} + +static void + vl_api_snat_det_close_session_out_t_handler + (vl_api_snat_det_close_session_out_t * mp) +{ + snat_main_t *sm = &snat_main; + vl_api_snat_det_close_session_out_reply_t *rmp; + ip4_address_t out_addr, ext_addr, in_addr; + snat_det_out_key_t key; + snat_det_map_t *dm; + snat_det_session_t *ses; + int rv = 0; + + clib_memcpy (&out_addr, mp->out_addr, 4); + clib_memcpy (&ext_addr, mp->ext_addr, 4); + + dm = snat_det_map_by_out (sm, &out_addr); + if (!dm) + { + rv = VNET_API_ERROR_NO_SUCH_ENTRY; + goto send_reply; + } + snat_det_reverse (dm, &ext_addr, ntohs (mp->out_port), &in_addr); + key.ext_host_addr = ext_addr; + key.ext_host_port = mp->ext_port; + key.out_port = mp->out_port; + ses = snat_det_get_ses_by_out (dm, &in_addr, key.as_u64); + if (!ses) + { + rv = VNET_API_ERROR_NO_SUCH_ENTRY; + goto send_reply; + } + snat_det_ses_close (dm, ses); + +send_reply: + REPLY_MACRO (VL_API_SNAT_DET_CLOSE_SESSION_OUT_REPLY); +} + +static void *vl_api_snat_det_close_session_out_t_print + (vl_api_snat_det_close_session_out_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: snat_det_close_session_out "); + s = format (s, "out_addr %U out_port %d " + "ext_addr %U ext_port %d\n", + format_ip4_address, mp->out_addr, ntohs (mp->out_port), + format_ip4_address, mp->ext_addr, ntohs (mp->ext_port)); + + FINISH; +} + +static void + vl_api_snat_det_close_session_in_t_handler + (vl_api_snat_det_close_session_in_t * mp) +{ + snat_main_t *sm = &snat_main; + vl_api_snat_det_close_session_in_reply_t *rmp; + ip4_address_t in_addr, ext_addr; + snat_det_out_key_t key; + snat_det_map_t *dm; + snat_det_session_t *ses; + int rv = 0; + + clib_memcpy (&in_addr, mp->in_addr, 4); + clib_memcpy (&ext_addr, mp->ext_addr, 4); + + dm = snat_det_map_by_user (sm, &in_addr); + if (!dm) + { + rv = VNET_API_ERROR_NO_SUCH_ENTRY; + goto send_reply; + } + key.ext_host_addr = ext_addr; + key.ext_host_port = mp->ext_port; + ses = snat_det_find_ses_by_in (dm, &in_addr, mp->in_port, key); + if (!ses) + { + rv = VNET_API_ERROR_NO_SUCH_ENTRY; + goto send_reply; + } + snat_det_ses_close (dm, ses); + +send_reply: + 
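/* REPLY_MACRO comes from vlibapi/api_helper_macros.h with
+   * REPLY_MSG_ID_BASE (sm->msg_id_base here) added to the message id so
+   * the reply lands in the plugin's message-id range; it expands to
+   * roughly:
+   *
+   *   rmp = vl_msg_api_alloc (sizeof (*rmp));
+   *   rmp->_vl_msg_id = ntohs (msg_id + REPLY_MSG_ID_BASE);
+   *   rmp->context = mp->context;
+   *   rmp->retval = ntohl (rv);
+   *   vl_msg_api_send_shmem (q, (u8 *) &rmp);
+   *
+   * so the rv computed above reaches the client as the reply retval. */
+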
REPLY_MACRO (VL_API_SNAT_DET_CLOSE_SESSION_IN_REPLY);
+}
+
+static void *vl_api_snat_det_close_session_in_t_print
+  (vl_api_snat_det_close_session_in_t * mp, void *handle)
+{
+  u8 *s;
+  s = format (0, "SCRIPT: snat_det_close_session_in ");
+  s = format (s, "in_addr %U in_port %d "
+              "ext_addr %U ext_port %d\n",
+              format_ip4_address, mp->in_addr, ntohs (mp->in_port),
+              format_ip4_address, mp->ext_addr, ntohs (mp->ext_port));
+
+  FINISH;
+}
+
+static void
+  send_snat_det_session_details
+  (snat_det_session_t * s, unix_shared_memory_queue_t * q, u32 context)
+{
+  vl_api_snat_det_session_details_t *rmp;
+  snat_main_t *sm = &snat_main;
+
+  rmp = vl_msg_api_alloc (sizeof (*rmp));
+  memset (rmp, 0, sizeof (*rmp));
+  rmp->_vl_msg_id = ntohs (VL_API_SNAT_DET_SESSION_DETAILS + sm->msg_id_base);
+  rmp->is_ip4 = 1;
+  rmp->in_port = s->in_port;
+  clib_memcpy (rmp->ext_addr, &s->out.ext_host_addr, 4);
+  rmp->ext_port = s->out.ext_host_port;
+  rmp->out_port = s->out.out_port;
+  rmp->state = s->state;
+  rmp->expire = ntohl (s->expire);
+  rmp->context = context;
+
+  vl_msg_api_send_shmem (q, (u8 *) & rmp);
+}
+
+static void
+vl_api_snat_det_session_dump_t_handler (vl_api_snat_det_session_dump_t * mp)
+{
+  unix_shared_memory_queue_t *q;
+  snat_main_t *sm = &snat_main;
+  ip4_address_t user_addr;
+  snat_det_map_t *dm;
+  snat_det_session_t *s, empty_ses;
+  u16 i;
+
+  q = vl_api_client_index_to_input_queue (mp->client_index);
+  if (q == 0)
+    return;
+  if (!mp->is_ip4)
+    return;
+
+  memset (&empty_ses, 0, sizeof (empty_ses));
+  clib_memcpy (&user_addr, mp->user_addr, 4);
+  dm = snat_det_map_by_user (sm, &user_addr);
+  if (!dm)
+    return;
+
+  s = dm->sessions + snat_det_user_ses_offset (&user_addr, dm->in_plen);
+  for (i = 0; i < SNAT_DET_SES_PER_USER; i++)
+    {
+      if (s->out.as_u64)
+	send_snat_det_session_details (s, q, mp->context);
+      s++;
+    }
+}
+
+static void *vl_api_snat_det_session_dump_t_print
+  (vl_api_snat_det_session_dump_t * mp, void *handle)
+{
+  u8 *s;
+
+  s = format (0, "SCRIPT: snat_det_session_dump ");
+  s = format (s, "user_addr %U\n", format_ip4_address, mp->user_addr);
+
+  FINISH;
+}
+
+/******************************/
+/*** Common NAT plugin APIs ***/
+/******************************/
+
+static void
+vl_api_nat_control_ping_t_handler (vl_api_nat_control_ping_t * mp)
+{
+  vl_api_nat_control_ping_reply_t *rmp;
+  snat_main_t *sm = &snat_main;
+  int rv = 0;
+
+  /* *INDENT-OFF* */
+  REPLY_MACRO2 (VL_API_NAT_CONTROL_PING_REPLY,
+  ({
+    rmp->vpe_pid = ntohl (getpid ());
+  }));
+  /* *INDENT-ON* */
+}
+
+static void *
+vl_api_nat_control_ping_t_print (vl_api_nat_control_ping_t * mp, void *handle)
+{
+  u8 *s;
+
+  s = format (0, "SCRIPT: nat_control_ping ");
+
+  FINISH;
+}
+
+static void
+vl_api_nat_show_config_t_handler (vl_api_nat_show_config_t * mp)
+{
+  vl_api_nat_show_config_reply_t *rmp;
+  snat_main_t *sm = &snat_main;
+  int rv = 0;
+
+  /* *INDENT-OFF* */
+  REPLY_MACRO2 (VL_API_NAT_SHOW_CONFIG_REPLY,
+  ({
+    rmp->translation_buckets = htonl (sm->translation_buckets);
+    rmp->translation_memory_size = htonl (sm->translation_memory_size);
+    rmp->user_buckets = htonl (sm->user_buckets);
+    rmp->user_memory_size = htonl (sm->user_memory_size);
+    rmp->max_translations_per_user = htonl (sm->max_translations_per_user);
+    rmp->outside_vrf_id = htonl (sm->outside_vrf_id);
+    rmp->inside_vrf_id = htonl (sm->inside_vrf_id);
+    rmp->static_mapping_only = sm->static_mapping_only;
+    rmp->static_mapping_connection_tracking =
+      sm->static_mapping_connection_tracking;
+    rmp->deterministic = sm->deterministic;
+  }));
+  /*
*INDENT-ON* */ +} + +static void * +vl_api_nat_show_config_t_print (vl_api_nat_show_config_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat_show_config "); + + FINISH; +} + +static void +vl_api_nat_set_workers_t_handler (vl_api_nat_set_workers_t * mp) +{ + snat_main_t *sm = &snat_main; + vl_api_snat_set_workers_reply_t *rmp; + int rv = 0; + uword *bitmap = 0; + u64 mask = clib_net_to_host_u64 (mp->worker_mask); + + if (sm->num_workers < 2) + { + rv = VNET_API_ERROR_FEATURE_DISABLED; + goto send_reply; + } + + bitmap = clib_bitmap_set_multiple (bitmap, 0, mask, BITS (mask)); + rv = snat_set_workers (bitmap); + clib_bitmap_free (bitmap); + +send_reply: + REPLY_MACRO (VL_API_NAT_SET_WORKERS_REPLY); +} + +static void * +vl_api_nat_set_workers_t_print (vl_api_nat_set_workers_t * mp, void *handle) +{ + u8 *s; + uword *bitmap = 0; + u8 first = 1; + int i; + u64 mask = clib_net_to_host_u64 (mp->worker_mask); + + s = format (0, "SCRIPT: nat_set_workers "); + bitmap = clib_bitmap_set_multiple (bitmap, 0, mask, BITS (mask)); + /* *INDENT-OFF* */ + clib_bitmap_foreach (i, bitmap, + ({ + if (first) + s = format (s, "%d", i); + else + s = format (s, ",%d", i); + first = 0; + })); + /* *INDENT-ON* */ + clib_bitmap_free (bitmap); + FINISH; +} + +static void +send_nat_worker_details (u32 worker_index, unix_shared_memory_queue_t * q, + u32 context) +{ + vl_api_nat_worker_details_t *rmp; + snat_main_t *sm = &snat_main; + vlib_worker_thread_t *w = + vlib_worker_threads + worker_index + sm->first_worker_index; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_NAT_WORKER_DETAILS + sm->msg_id_base); + rmp->context = context; + rmp->worker_index = htonl (worker_index); + rmp->lcore_id = htonl (w->lcore_id); + strncpy ((char *) rmp->name, (char *) w->name, ARRAY_LEN (rmp->name) - 1); + + vl_msg_api_send_shmem (q, (u8 *) & rmp); +} + +static void +vl_api_nat_worker_dump_t_handler (vl_api_nat_worker_dump_t * mp) +{ + unix_shared_memory_queue_t *q; + snat_main_t *sm = &snat_main; + u32 *worker_index; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + return; + + /* *INDENT-OFF* */ + vec_foreach (worker_index, sm->workers) + send_nat_worker_details(*worker_index, q, mp->context); + /* *INDENT-ON* */ +} + +static void * +vl_api_nat_worker_dump_t_print (vl_api_nat_worker_dump_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat_worker_dump "); + + FINISH; +} + +static void +vl_api_nat_ipfix_enable_disable_t_handler (vl_api_nat_ipfix_enable_disable_t * + mp) +{ + snat_main_t *sm = &snat_main; + vl_api_nat_ipfix_enable_disable_reply_t *rmp; + int rv = 0; + + rv = snat_ipfix_logging_enable_disable (mp->enable, + clib_host_to_net_u32 + (mp->domain_id), + clib_host_to_net_u16 + (mp->src_port)); + + REPLY_MACRO (VL_API_NAT_IPFIX_ENABLE_DISABLE_REPLY); +} + +static void * +vl_api_nat_ipfix_enable_disable_t_print (vl_api_nat_ipfix_enable_disable_t * + mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat_ipfix_enable_disable "); + if (mp->domain_id) + s = format (s, "domain %d ", clib_net_to_host_u32 (mp->domain_id)); + if (mp->src_port) + s = format (s, "src_port %d ", clib_net_to_host_u16 (mp->src_port)); + if (!mp->enable) + s = format (s, "disable "); + + FINISH; +} + +/*************/ +/*** NAT44 ***/ +/*************/ +static void + vl_api_nat44_add_del_address_range_t_handler + (vl_api_nat44_add_del_address_range_t * mp) +{ + snat_main_t *sm = &snat_main; + 
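/* This handler expands [first_ip_address, last_ip_address] (both in
+   * network byte order) into individual NAT pool addresses, e.g.
+   * 10.0.0.1 - 10.0.0.3 gives count = 3 and three snat_add_address()
+   * calls.  The arithmetic below assumes first <= last; a range of
+   * more than 1024 addresses is only warned about, not rejected. */
+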
vl_api_nat44_add_del_address_range_reply_t *rmp; + ip4_address_t this_addr; + u32 start_host_order, end_host_order; + u32 vrf_id; + int i, count; + int rv = 0; + u32 *tmp; + + if (sm->static_mapping_only) + { + rv = VNET_API_ERROR_FEATURE_DISABLED; + goto send_reply; + } + + tmp = (u32 *) mp->first_ip_address; + start_host_order = clib_host_to_net_u32 (tmp[0]); + tmp = (u32 *) mp->last_ip_address; + end_host_order = clib_host_to_net_u32 (tmp[0]); + + count = (end_host_order - start_host_order) + 1; + + vrf_id = clib_host_to_net_u32 (mp->vrf_id); + + if (count > 1024) + clib_warning ("%U - %U, %d addresses...", + format_ip4_address, mp->first_ip_address, + format_ip4_address, mp->last_ip_address, count); + + memcpy (&this_addr.as_u8, mp->first_ip_address, 4); + + for (i = 0; i < count; i++) + { + if (mp->is_add) + snat_add_address (sm, &this_addr, vrf_id); + else + rv = snat_del_address (sm, this_addr, 0); + + if (rv) + goto send_reply; + + increment_v4_address (&this_addr); + } + +send_reply: + REPLY_MACRO (VL_API_NAT44_ADD_DEL_ADDRESS_RANGE_REPLY); +} + +static void *vl_api_nat44_add_del_address_range_t_print + (vl_api_nat44_add_del_address_range_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat44_add_address_range "); + s = format (s, "%U ", format_ip4_address, mp->first_ip_address); + if (memcmp (mp->first_ip_address, mp->last_ip_address, 4)) + { + s = format (s, " - %U ", format_ip4_address, mp->last_ip_address); + } + FINISH; +} + +static void +send_nat44_address_details (snat_address_t * a, + unix_shared_memory_queue_t * q, u32 context) +{ + vl_api_nat44_address_details_t *rmp; + snat_main_t *sm = &snat_main; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_NAT44_ADDRESS_DETAILS + sm->msg_id_base); + clib_memcpy (rmp->ip_address, &(a->addr), 4); + if (a->fib_index != ~0) + { + fib_table_t *fib = fib_table_get (a->fib_index, FIB_PROTOCOL_IP4); + rmp->vrf_id = ntohl (fib->ft_table_id); + } + else + rmp->vrf_id = ~0; + rmp->context = context; + + vl_msg_api_send_shmem (q, (u8 *) & rmp); +} + +static void +vl_api_nat44_address_dump_t_handler (vl_api_nat44_address_dump_t * mp) +{ + unix_shared_memory_queue_t *q; + snat_main_t *sm = &snat_main; + snat_address_t *a; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + return; + + /* *INDENT-OFF* */ + vec_foreach (a, sm->addresses) + send_nat44_address_details (a, q, mp->context); + /* *INDENT-ON* */ +} + +static void * +vl_api_nat44_address_dump_t_print (vl_api_nat44_address_dump_t * mp, + void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat44_address_dump "); + + FINISH; +} + +static void + vl_api_nat44_interface_add_del_feature_t_handler + (vl_api_nat44_interface_add_del_feature_t * mp) +{ + snat_main_t *sm = &snat_main; + vl_api_nat44_interface_add_del_feature_reply_t *rmp; + u8 is_del = mp->is_add == 0; + u32 sw_if_index = ntohl (mp->sw_if_index); + int rv = 0; + + VALIDATE_SW_IF_INDEX (mp); + + rv = snat_interface_add_del (sw_if_index, mp->is_inside, is_del); + + BAD_SW_IF_INDEX_LABEL; + + REPLY_MACRO (VL_API_NAT44_INTERFACE_ADD_DEL_FEATURE_REPLY); +} + +static void *vl_api_nat44_interface_add_del_feature_t_print + (vl_api_nat44_interface_add_del_feature_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat44_interface_add_del_feature "); + s = format (s, "sw_if_index %d %s %s", + clib_host_to_net_u32 (mp->sw_if_index), + mp->is_inside ? "in" : "out", mp->is_add ? 
"" : "del"); + + FINISH; +} + +static void +send_nat44_interface_details (snat_interface_t * i, + unix_shared_memory_queue_t * q, u32 context) +{ + vl_api_nat44_interface_details_t *rmp; + snat_main_t *sm = &snat_main; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_NAT44_INTERFACE_DETAILS + sm->msg_id_base); + rmp->sw_if_index = ntohl (i->sw_if_index); + rmp->is_inside = i->is_inside; + rmp->context = context; + + vl_msg_api_send_shmem (q, (u8 *) & rmp); +} + +static void +vl_api_nat44_interface_dump_t_handler (vl_api_nat44_interface_dump_t * mp) +{ + unix_shared_memory_queue_t *q; + snat_main_t *sm = &snat_main; + snat_interface_t *i; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + return; + + /* *INDENT-OFF* */ + pool_foreach (i, sm->interfaces, + ({ + send_nat44_interface_details(i, q, mp->context); + })); + /* *INDENT-ON* */ +} + +static void * +vl_api_nat44_interface_dump_t_print (vl_api_nat44_interface_dump_t * mp, + void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat44_interface_dump "); + + FINISH; +} + +static void + vl_api_nat44_interface_add_del_output_feature_t_handler + (vl_api_nat44_interface_add_del_output_feature_t * mp) +{ + snat_main_t *sm = &snat_main; + vl_api_nat44_interface_add_del_output_feature_reply_t *rmp; + u8 is_del = mp->is_add == 0; + u32 sw_if_index = ntohl (mp->sw_if_index); + int rv = 0; + + VALIDATE_SW_IF_INDEX (mp); + + rv = snat_interface_add_del_output_feature (sw_if_index, mp->is_inside, + is_del); + + BAD_SW_IF_INDEX_LABEL; + + REPLY_MACRO (VL_API_NAT44_INTERFACE_ADD_DEL_OUTPUT_FEATURE_REPLY); +} + +static void *vl_api_nat44_interface_add_del_output_feature_t_print + (vl_api_nat44_interface_add_del_output_feature_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat44_interface_add_del_output_feature "); + s = format (s, "sw_if_index %d %s %s", + clib_host_to_net_u32 (mp->sw_if_index), + mp->is_inside ? "in" : "out", mp->is_add ? 
"" : "del"); + + FINISH; +} + +static void +send_nat44_interface_output_feature_details (snat_interface_t * i, + unix_shared_memory_queue_t * q, + u32 context) +{ + vl_api_nat44_interface_output_feature_details_t *rmp; + snat_main_t *sm = &snat_main; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = + ntohs (VL_API_NAT44_INTERFACE_OUTPUT_FEATURE_DETAILS + sm->msg_id_base); + rmp->sw_if_index = ntohl (i->sw_if_index); + rmp->context = context; + rmp->is_inside = i->is_inside; + + vl_msg_api_send_shmem (q, (u8 *) & rmp); +} + +static void + vl_api_nat44_interface_output_feature_dump_t_handler + (vl_api_nat44_interface_output_feature_dump_t * mp) +{ + unix_shared_memory_queue_t *q; + snat_main_t *sm = &snat_main; + snat_interface_t *i; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + return; + + /* *INDENT-OFF* */ + pool_foreach (i, sm->output_feature_interfaces, + ({ + send_nat44_interface_output_feature_details(i, q, mp->context); + })); + /* *INDENT-ON* */ +} + +static void *vl_api_nat44_interface_output_feature_dump_t_print + (vl_api_nat44_interface_output_feature_dump_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat44_interface_output_feature_dump "); + + FINISH; +} + +static void + vl_api_nat44_add_del_static_mapping_t_handler + (vl_api_nat44_add_del_static_mapping_t * mp) +{ + snat_main_t *sm = &snat_main; + vl_api_nat44_add_del_static_mapping_reply_t *rmp; + ip4_address_t local_addr, external_addr; + u16 local_port = 0, external_port = 0; + u32 vrf_id, external_sw_if_index; + int rv = 0; + snat_protocol_t proto; + + memcpy (&local_addr.as_u8, mp->local_ip_address, 4); + memcpy (&external_addr.as_u8, mp->external_ip_address, 4); + if (mp->addr_only == 0) + { + local_port = clib_net_to_host_u16 (mp->local_port); + external_port = clib_net_to_host_u16 (mp->external_port); + } + vrf_id = clib_net_to_host_u32 (mp->vrf_id); + external_sw_if_index = clib_net_to_host_u32 (mp->external_sw_if_index); + proto = ip_proto_to_snat_proto (mp->protocol); + + rv = snat_add_static_mapping (local_addr, external_addr, local_port, + external_port, vrf_id, mp->addr_only, + external_sw_if_index, proto, mp->is_add); + + REPLY_MACRO (VL_API_NAT44_ADD_DEL_STATIC_MAPPING_REPLY); +} + +static void *vl_api_nat44_add_del_static_mapping_t_print + (vl_api_nat44_add_del_static_mapping_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat44_add_del_static_mapping "); + s = format (s, "protocol %d local_addr %U external_addr %U ", + mp->protocol, + format_ip4_address, mp->local_ip_address, + format_ip4_address, mp->external_ip_address); + + if (mp->addr_only == 0) + s = format (s, "local_port %d external_port %d ", + clib_net_to_host_u16 (mp->local_port), + clib_net_to_host_u16 (mp->external_port)); + + if (mp->vrf_id != ~0) + s = format (s, "vrf %d", clib_net_to_host_u32 (mp->vrf_id)); + + if (mp->external_sw_if_index != ~0) + s = format (s, "external_sw_if_index %d", + clib_net_to_host_u32 (mp->external_sw_if_index)); + FINISH; +} + +static void +send_nat44_static_mapping_details (snat_static_mapping_t * m, + unix_shared_memory_queue_t * q, + u32 context) +{ + vl_api_nat44_static_mapping_details_t *rmp; + snat_main_t *sm = &snat_main; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = + ntohs (VL_API_NAT44_STATIC_MAPPING_DETAILS + sm->msg_id_base); + rmp->addr_only = m->addr_only; + clib_memcpy (rmp->local_ip_address, &(m->local_addr), 4); + clib_memcpy 
(rmp->external_ip_address, &(m->external_addr), 4); + rmp->local_port = htons (m->local_port); + rmp->external_port = htons (m->external_port); + rmp->external_sw_if_index = ~0; + rmp->vrf_id = htonl (m->vrf_id); + rmp->protocol = snat_proto_to_ip_proto (m->proto); + rmp->context = context; + + vl_msg_api_send_shmem (q, (u8 *) & rmp); +} + +static void +send_nat44_static_map_resolve_details (snat_static_map_resolve_t * m, + unix_shared_memory_queue_t * q, + u32 context) +{ + vl_api_nat44_static_mapping_details_t *rmp; + snat_main_t *sm = &snat_main; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = + ntohs (VL_API_NAT44_STATIC_MAPPING_DETAILS + sm->msg_id_base); + rmp->addr_only = m->addr_only; + clib_memcpy (rmp->local_ip_address, &(m->l_addr), 4); + rmp->local_port = htons (m->l_port); + rmp->external_port = htons (m->e_port); + rmp->external_sw_if_index = htonl (m->sw_if_index); + rmp->vrf_id = htonl (m->vrf_id); + rmp->protocol = snat_proto_to_ip_proto (m->proto); + rmp->context = context; + + vl_msg_api_send_shmem (q, (u8 *) & rmp); +} + +static void +vl_api_nat44_static_mapping_dump_t_handler (vl_api_nat44_static_mapping_dump_t + * mp) +{ + unix_shared_memory_queue_t *q; + snat_main_t *sm = &snat_main; + snat_static_mapping_t *m; + snat_static_map_resolve_t *rp; + int j; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + return; + + /* *INDENT-OFF* */ + pool_foreach (m, sm->static_mappings, + ({ + if (!vec_len(m->locals)) + send_nat44_static_mapping_details (m, q, mp->context); + })); + /* *INDENT-ON* */ + + for (j = 0; j < vec_len (sm->to_resolve); j++) + { + rp = sm->to_resolve + j; + send_nat44_static_map_resolve_details (rp, q, mp->context); + } +} + +static void * +vl_api_nat44_static_mapping_dump_t_print (vl_api_nat44_static_mapping_dump_t * + mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat44_static_mapping_dump "); + + FINISH; +} + +static void + vl_api_nat44_add_del_interface_addr_t_handler + (vl_api_nat44_add_del_interface_addr_t * mp) +{ + snat_main_t *sm = &snat_main; + vl_api_nat44_add_del_interface_addr_reply_t *rmp; + u8 is_del = mp->is_add == 0; + u32 sw_if_index = ntohl (mp->sw_if_index); + int rv = 0; + + VALIDATE_SW_IF_INDEX (mp); + + rv = snat_add_interface_address (sm, sw_if_index, is_del); + + BAD_SW_IF_INDEX_LABEL; + + REPLY_MACRO (VL_API_NAT44_ADD_DEL_INTERFACE_ADDR_REPLY); +} + +static void *vl_api_nat44_add_del_interface_addr_t_print + (vl_api_nat44_add_del_interface_addr_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat44_add_del_interface_addr "); + s = format (s, "sw_if_index %d %s", + clib_host_to_net_u32 (mp->sw_if_index), + mp->is_add ? 
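The static-mapping dump above follows the stock VPP binary-API dump convention: the client sends a single ..._dump request and the plugin streams zero or more ..._details messages back on the client's shared-memory queue. Nothing in the stream marks the end of a dump, which is why the plugin also exposes a control-ping: clients send the ping right after the dump and treat its reply as the terminator. A minimal client-side sketch of that loop, with hypothetical send_msg/recv_msg transport helpers standing in for a real shared-memory client library:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical transport helpers -- stand-ins for a real client
   library, not functions provided by the plugin code above. */
extern void send_msg (uint16_t msg_id, const void *payload);
extern uint16_t recv_msg (void **payload);

enum
{
  NAT44_STATIC_MAPPING_DUMP = 1,
  NAT44_STATIC_MAPPING_DETAILS = 2,
  CONTROL_PING = 3,
  CONTROL_PING_REPLY = 4
};

static void
dump_static_mappings (void)
{
  send_msg (NAT44_STATIC_MAPPING_DUMP, 0);
  send_msg (CONTROL_PING, 0);   /* marks the end of the details stream */

  for (;;)
    {
      void *payload;
      uint16_t id = recv_msg (&payload);
      if (id == CONTROL_PING_REPLY)
        break;                  /* dump complete */
      if (id == NAT44_STATIC_MAPPING_DETAILS)
        printf ("got one nat44_static_mapping_details record\n");
    }
}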
"" : "del"); + + FINISH; +} + +static void +send_nat44_interface_addr_details (u32 sw_if_index, + unix_shared_memory_queue_t * q, + u32 context) +{ + vl_api_nat44_interface_addr_details_t *rmp; + snat_main_t *sm = &snat_main; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = + ntohs (VL_API_NAT44_INTERFACE_ADDR_DETAILS + sm->msg_id_base); + rmp->sw_if_index = ntohl (sw_if_index); + rmp->context = context; + + vl_msg_api_send_shmem (q, (u8 *) & rmp); +} + +static void +vl_api_nat44_interface_addr_dump_t_handler (vl_api_nat44_interface_addr_dump_t + * mp) +{ + unix_shared_memory_queue_t *q; + snat_main_t *sm = &snat_main; + u32 *i; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + return; + + /* *INDENT-OFF* */ + vec_foreach (i, sm->auto_add_sw_if_indices) + send_nat44_interface_addr_details(*i, q, mp->context); + /* *INDENT-ON* */ +} + +static void * +vl_api_nat44_interface_addr_dump_t_print (vl_api_nat44_interface_addr_dump_t * + mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat44_interface_addr_dump "); + + FINISH; +} + +static void +send_nat44_user_details (snat_user_t * u, unix_shared_memory_queue_t * q, + u32 context) +{ + vl_api_nat44_user_details_t *rmp; + snat_main_t *sm = &snat_main; + fib_table_t *fib = fib_table_get (u->fib_index, FIB_PROTOCOL_IP4); + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_NAT44_USER_DETAILS + sm->msg_id_base); + + rmp->vrf_id = ntohl (fib->ft_table_id); + + clib_memcpy (rmp->ip_address, &(u->addr), 4); + rmp->nsessions = ntohl (u->nsessions); + rmp->nstaticsessions = ntohl (u->nstaticsessions); + rmp->context = context; + + vl_msg_api_send_shmem (q, (u8 *) & rmp); +} + +static void +vl_api_nat44_user_dump_t_handler (vl_api_nat44_user_dump_t * mp) +{ + unix_shared_memory_queue_t *q; + snat_main_t *sm = &snat_main; + snat_main_per_thread_data_t *tsm; + snat_user_t *u; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + return; + + /* *INDENT-OFF* */ + vec_foreach (tsm, sm->per_thread_data) + vec_foreach (u, tsm->users) + send_nat44_user_details (u, q, mp->context); + /* *INDENT-ON* */ +} + +static void * +vl_api_nat44_user_dump_t_print (vl_api_nat44_user_dump_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat44_user_dump "); + + FINISH; +} + +static void +send_nat44_user_session_details (snat_session_t * s, + unix_shared_memory_queue_t * q, u32 context) +{ + vl_api_nat44_user_session_details_t *rmp; + snat_main_t *sm = &snat_main; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = + ntohs (VL_API_NAT44_USER_SESSION_DETAILS + sm->msg_id_base); + clib_memcpy (rmp->outside_ip_address, (&s->out2in.addr), 4); + clib_memcpy (rmp->inside_ip_address, (&s->in2out.addr), 4); + rmp->is_static = s->flags & SNAT_SESSION_FLAG_STATIC_MAPPING ? 
1 : 0; + rmp->last_heard = clib_host_to_net_u64 ((u64) s->last_heard); + rmp->total_bytes = clib_host_to_net_u64 (s->total_bytes); + rmp->total_pkts = ntohl (s->total_pkts); + rmp->context = context; + if (snat_is_unk_proto_session (s)) + { + rmp->outside_port = 0; + rmp->inside_port = 0; + rmp->protocol = ntohs (s->in2out.port); + } + else + { + rmp->outside_port = s->out2in.port; + rmp->inside_port = s->in2out.port; + rmp->protocol = ntohs (snat_proto_to_ip_proto (s->in2out.protocol)); + } + + vl_msg_api_send_shmem (q, (u8 *) & rmp); +} + +static void +vl_api_nat44_user_session_dump_t_handler (vl_api_nat44_user_session_dump_t * + mp) +{ + unix_shared_memory_queue_t *q; + snat_main_t *sm = &snat_main; + snat_main_per_thread_data_t *tsm; + snat_session_t *s; + clib_bihash_kv_8_8_t key, value; + snat_user_key_t ukey; + snat_user_t *u; + u32 session_index, head_index, elt_index; + dlist_elt_t *head, *elt; + ip4_header_t ip; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + return; + + clib_memcpy (&ukey.addr, mp->ip_address, 4); + ip.src_address.as_u32 = ukey.addr.as_u32; + ukey.fib_index = fib_table_find (FIB_PROTOCOL_IP4, ntohl (mp->vrf_id)); + key.key = ukey.as_u64; + if (sm->num_workers) + tsm = + vec_elt_at_index (sm->per_thread_data, + sm->worker_in2out_cb (&ip, ukey.fib_index)); + else + tsm = vec_elt_at_index (sm->per_thread_data, sm->num_workers); + if (clib_bihash_search_8_8 (&tsm->user_hash, &key, &value)) + return; + u = pool_elt_at_index (tsm->users, value.value); + if (!u->nsessions && !u->nstaticsessions) + return; + + head_index = u->sessions_per_user_list_head_index; + head = pool_elt_at_index (tsm->list_pool, head_index); + elt_index = head->next; + elt = pool_elt_at_index (tsm->list_pool, elt_index); + session_index = elt->value; + while (session_index != ~0) + { + s = pool_elt_at_index (tsm->sessions, session_index); + + send_nat44_user_session_details (s, q, mp->context); + + elt_index = elt->next; + elt = pool_elt_at_index (tsm->list_pool, elt_index); + session_index = elt->value; + } +} + +static void * +vl_api_nat44_user_session_dump_t_print (vl_api_nat44_user_session_dump_t * mp, + void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat44_user_session_dump "); + s = format (s, "ip_address %U vrf_id %d\n", + format_ip4_address, mp->ip_address, + clib_net_to_host_u32 (mp->vrf_id)); + + FINISH; +} + +static nat44_lb_addr_port_t * +unformat_nat44_lb_addr_port (vl_api_nat44_lb_addr_port_t * addr_port_pairs, + u8 addr_port_pair_num) +{ + u8 i; + nat44_lb_addr_port_t *lb_addr_port_pairs = 0, lb_addr_port; + vl_api_nat44_lb_addr_port_t *ap; + + for (i = 0; i < addr_port_pair_num; i++) + { + ap = &addr_port_pairs[i]; + memset (&lb_addr_port, 0, sizeof (lb_addr_port)); + clib_memcpy (&lb_addr_port.addr, ap->addr, 4); + lb_addr_port.port = clib_net_to_host_u16 (ap->port); + lb_addr_port.probability = ap->probability; + vec_add1 (lb_addr_port_pairs, lb_addr_port); + } + + return lb_addr_port_pairs; +} + +static void + vl_api_nat44_add_del_lb_static_mapping_t_handler + (vl_api_nat44_add_del_lb_static_mapping_t * mp) +{ + snat_main_t *sm = &snat_main; + vl_api_nat44_add_del_lb_static_mapping_reply_t *rmp; + int rv = 0; + nat44_lb_addr_port_t *locals = 0; + ip4_address_t e_addr; + snat_protocol_t proto; + + locals = unformat_nat44_lb_addr_port (mp->locals, mp->local_num); + clib_memcpy (&e_addr, mp->external_addr, 4); + proto = ip_proto_to_snat_proto (mp->protocol); + + rv = + nat44_add_del_lb_static_mapping (e_addr, + clib_net_to_host_u16 
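The user-session dump above costs one 8-byte bihash lookup to find the user -- snat_user_key_t packs the IPv4 address and FIB index into a single u64 (key.key = ukey.as_u64) -- and then walks that user's sessions through a doubly-linked list kept in tsm->list_pool, where each dlist_elt_t holds a session pool index in value and the pool index of the next element in next, the head's value of ~0 terminating the walk. A standalone sketch of that traversal with the pool reduced to a plain array (illustrative layout, not the real VPP dlist API):

#include <stdint.h>
#include <stdio.h>

/* Simplified dlist element: 'value' is a session index, 'next' is the
   pool index of the following element; ~0 in 'value' marks the head. */
typedef struct
{
  uint32_t value, next;
} list_elt;

int
main (void)
{
  /* element 0 is the list head; sessions 10 and 11 hang off it */
  list_elt pool[3] = { { ~0u, 1 }, { 10, 2 }, { 11, 0 } };
  uint32_t elt_index = pool[0].next;

  while (pool[elt_index].value != ~0u)  /* same terminator as the handler */
    {
      printf ("session index %u\n", pool[elt_index].value);
      elt_index = pool[elt_index].next;
    }
  return 0;
}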
(mp->external_port), + proto, clib_net_to_host_u32 (mp->vrf_id), + locals, mp->is_add); + + vec_free (locals); + + REPLY_MACRO (VL_API_NAT44_ADD_DEL_LB_STATIC_MAPPING_REPLY); +} + +static void *vl_api_nat44_add_del_lb_static_mapping_t_print + (vl_api_nat44_add_del_lb_static_mapping_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat44_add_del_lb_static_mapping "); + s = format (s, "is_add %d\n", mp->is_add); + + FINISH; +} + +static void +send_nat44_lb_static_mapping_details (snat_static_mapping_t * m, + unix_shared_memory_queue_t * q, + u32 context) +{ + vl_api_nat44_lb_static_mapping_details_t *rmp; + snat_main_t *sm = &snat_main; + nat44_lb_addr_port_t *ap; + vl_api_nat44_lb_addr_port_t *locals; + + rmp = + vl_msg_api_alloc (sizeof (*rmp) + + (vec_len (m->locals) * sizeof (nat44_lb_addr_port_t))); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = + ntohs (VL_API_NAT44_LB_STATIC_MAPPING_DETAILS + sm->msg_id_base); + + clib_memcpy (rmp->external_addr, &(m->external_addr), 4); + rmp->external_port = ntohs (m->external_port); + rmp->protocol = snat_proto_to_ip_proto (m->proto); + rmp->vrf_id = ntohl (m->vrf_id); + rmp->context = context; + + locals = (vl_api_nat44_lb_addr_port_t *) rmp->locals; + vec_foreach (ap, m->locals) + { + clib_memcpy (locals->addr, &(ap->addr), 4); + locals->port = htons (ap->port); + locals->probability = ap->probability; + locals++; + rmp->local_num++; + } + + vl_msg_api_send_shmem (q, (u8 *) & rmp); +} + +static void + vl_api_nat44_lb_static_mapping_dump_t_handler + (vl_api_nat44_lb_static_mapping_dump_t * mp) +{ + unix_shared_memory_queue_t *q; + snat_main_t *sm = &snat_main; + snat_static_mapping_t *m; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + return; + + /* *INDENT-OFF* */ + pool_foreach (m, sm->static_mappings, + ({ + if (vec_len(m->locals)) + send_nat44_lb_static_mapping_details (m, q, mp->context); + })); + /* *INDENT-ON* */ +} + +static void *vl_api_nat44_lb_static_mapping_dump_t_print + (vl_api_nat44_lb_static_mapping_dump_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat44_lb_static_mapping_dump "); + + FINISH; +} + +/*******************************/ +/*** Deterministic NAT (CGN) ***/ +/*******************************/ + +static void +vl_api_nat_det_add_del_map_t_handler (vl_api_nat_det_add_del_map_t * mp) +{ + snat_main_t *sm = &snat_main; + vl_api_nat_det_add_del_map_reply_t *rmp; + int rv = 0; + ip4_address_t in_addr, out_addr; + + if (!mp->is_nat44) + { + rv = VNET_API_ERROR_UNIMPLEMENTED; + goto send_reply; + } + + clib_memcpy (&in_addr, mp->in_addr, 4); + clib_memcpy (&out_addr, mp->out_addr, 4); + rv = snat_det_add_map (sm, &in_addr, mp->in_plen, &out_addr, + mp->out_plen, mp->is_add); + +send_reply: + REPLY_MACRO (VL_API_NAT_DET_ADD_DEL_MAP_REPLY); +} + +static void * +vl_api_nat_det_add_del_map_t_print (vl_api_nat_det_add_del_map_t * mp, + void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat_det_add_del_map "); + s = format (s, "inside address %U/%d outside address %U/%d\n", + format_ip4_address, mp->in_addr, mp->in_plen, + format_ip4_address, mp->out_addr, mp->out_plen); + + FINISH; +} + +static void +vl_api_nat_det_forward_t_handler (vl_api_nat_det_forward_t * mp) +{ + snat_main_t *sm = &snat_main; + vl_api_nat_det_forward_reply_t *rmp; + int rv = 0; + u16 lo_port = 0, hi_port = 0; + snat_det_map_t *dm; + ip4_address_t in_addr, out_addr; + + if (!mp->is_nat44) + { + out_addr.as_u32 = 0; + rv = VNET_API_ERROR_UNIMPLEMENTED; + goto send_reply; + } + + out_addr.as_u32 = 
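As in the load-balancing handler here, every multi-byte field crosses the API in network byte order: handlers convert exactly once on entry with clib_net_to_host_u16/u32, and the details senders convert back with htons/htonl before queueing a reply. These are plain byte swaps, so a quick standalone check of what the conversion does to a port value:

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

int
main (void)
{
  /* external_port 8080 travels on the wire as the bytes 0x1f 0x90 */
  uint16_t wire = htons (8080);
  uint8_t *b = (uint8_t *) &wire;

  printf ("wire bytes: %02x %02x\n", b[0], b[1]);       /* 1f 90 */
  printf ("host value: %u\n", ntohs (wire));            /* 8080 */
  return 0;
}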
0; + clib_memcpy (&in_addr, mp->in_addr, 4); + dm = snat_det_map_by_user (sm, &in_addr); + if (!dm) + { + rv = VNET_API_ERROR_NO_SUCH_ENTRY; + goto send_reply; + } + + snat_det_forward (dm, &in_addr, &out_addr, &lo_port); + hi_port = lo_port + dm->ports_per_host - 1; + +send_reply: + /* *INDENT-OFF* */ + REPLY_MACRO2 (VL_API_NAT_DET_FORWARD_REPLY, + ({ + rmp->out_port_lo = ntohs (lo_port); + rmp->out_port_hi = ntohs (hi_port); + clib_memcpy (rmp->out_addr, &out_addr, 4); + })) + /* *INDENT-ON* */ +} + +static void * +vl_api_nat_det_forward_t_print (vl_api_nat_det_forward_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat_det_forward"); + s = format (s, "inside ip address %U\n", format_ip4_address, mp->in_addr); + + FINISH; +} + +static void +vl_api_nat_det_reverse_t_handler (vl_api_nat_det_reverse_t * mp) +{ + snat_main_t *sm = &snat_main; + vl_api_nat_det_reverse_reply_t *rmp; + int rv = 0; + ip4_address_t out_addr, in_addr; + snat_det_map_t *dm; + + in_addr.as_u32 = 0; + clib_memcpy (&out_addr, mp->out_addr, 4); + dm = snat_det_map_by_out (sm, &out_addr); + if (!dm) + { + rv = VNET_API_ERROR_NO_SUCH_ENTRY; + goto send_reply; + } + + snat_det_reverse (dm, &out_addr, htons (mp->out_port), &in_addr); + +send_reply: + /* *INDENT-OFF* */ + REPLY_MACRO2 (VL_API_NAT_DET_REVERSE_REPLY, + ({ + rmp->is_nat44 = 1; + memset (rmp->in_addr, 0, 16); + clib_memcpy (rmp->in_addr, &in_addr, 4); + })) + /* *INDENT-ON* */ +} + +static void * +vl_api_nat_det_reverse_t_print (vl_api_nat_det_reverse_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat_det_reverse"); + s = format (s, "outside ip address %U outside port %d", + format_ip4_address, mp->out_addr, ntohs (mp->out_port)); + + FINISH; +} + +static void +sent_nat_det_map_details (snat_det_map_t * m, unix_shared_memory_queue_t * q, + u32 context) +{ + vl_api_nat_det_map_details_t *rmp; + snat_main_t *sm = &snat_main; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_NAT_DET_MAP_DETAILS + sm->msg_id_base); + rmp->is_nat44 = 1; + clib_memcpy (rmp->in_addr, &m->in_addr, 4); + rmp->in_plen = m->in_plen; + clib_memcpy (rmp->out_addr, &m->out_addr, 4); + rmp->out_plen = m->out_plen; + rmp->sharing_ratio = htonl (m->sharing_ratio); + rmp->ports_per_host = htons (m->ports_per_host); + rmp->ses_num = htonl (m->ses_num); + rmp->context = context; + + vl_msg_api_send_shmem (q, (u8 *) & rmp); +} + +static void +vl_api_nat_det_map_dump_t_handler (vl_api_nat_det_map_dump_t * mp) +{ + unix_shared_memory_queue_t *q; + snat_main_t *sm = &snat_main; + snat_det_map_t *m; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + return; + + /* *INDENT-OFF* */ + vec_foreach(m, sm->det_maps) + sent_nat_det_map_details(m, q, mp->context); + /* *INDENT-ON* */ +} + +static void * +vl_api_nat_det_map_dump_t_print (vl_api_nat_det_map_dump_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat_det_map_dump "); + + FINISH; +} + +static void +vl_api_nat_det_set_timeouts_t_handler (vl_api_nat_det_set_timeouts_t * mp) +{ + snat_main_t *sm = &snat_main; + vl_api_nat_det_set_timeouts_reply_t *rmp; + int rv = 0; + + sm->udp_timeout = ntohl (mp->udp); + sm->tcp_established_timeout = ntohl (mp->tcp_established); + sm->tcp_transitory_timeout = ntohl (mp->tcp_transitory); + sm->icmp_timeout = ntohl (mp->icmp); + + REPLY_MACRO (VL_API_NAT_DET_SET_TIMEOUTS_REPLY); +} + +static void * +vl_api_nat_det_set_timeouts_t_print (vl_api_nat_det_set_timeouts_t * mp, + void *handle) +{ 
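The deterministic-NAT forward and reverse handlers in this block reply via REPLY_MACRO2, which behaves like REPLY_MACRO plus a caller-supplied statement block that fills in the message-specific reply fields before the message is queued. A rough sketch of the shape it expands to -- illustrative only, not the literal macro from the VPP API headers, and with the client-queue lookup omitted:

#define REPLY_MACRO2_SKETCH(t, body)                          \
do {                                                          \
  rmp = vl_msg_api_alloc (sizeof (*rmp));                     \
  rmp->_vl_msg_id = ntohs ((t) + sm->msg_id_base);            \
  rmp->context = mp->context;                                 \
  rmp->retval = ntohl (rv);                                   \
  do { body; } while (0);  /* e.g. set out_addr, out_port */  \
  vl_msg_api_send_shmem (q, (u8 *) &rmp);                     \
} while (0)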
+  u8 *s;
+
+  s = format (0, "SCRIPT: nat_det_set_timeouts ");
+  s = format (s, "udp %d tcp_established %d tcp_transitory %d icmp %d\n",
+              ntohl (mp->udp),
+              ntohl (mp->tcp_established),
+              ntohl (mp->tcp_transitory), ntohl (mp->icmp));
+
+  FINISH;
+}
+
+static void
+vl_api_nat_det_get_timeouts_t_handler (vl_api_nat_det_get_timeouts_t * mp)
+{
+  snat_main_t *sm = &snat_main;
+  vl_api_nat_det_get_timeouts_reply_t *rmp;
+  int rv = 0;
+
+  /* *INDENT-OFF* */
+  REPLY_MACRO2 (VL_API_NAT_DET_GET_TIMEOUTS_REPLY,
+  ({
+    rmp->udp = htonl (sm->udp_timeout);
+    rmp->tcp_established = htonl (sm->tcp_established_timeout);
+    rmp->tcp_transitory = htonl (sm->tcp_transitory_timeout);
+    rmp->icmp = htonl (sm->icmp_timeout);
+  }))
+  /* *INDENT-ON* */
+}
+
+static void *
+vl_api_nat_det_get_timeouts_t_print (vl_api_nat_det_get_timeouts_t * mp,
+                                     void *handle)
+{
+  u8 *s;
+
+  s = format (0, "SCRIPT: nat_det_get_timeouts");
+
+  FINISH;
+}
+
+static void
+vl_api_nat_det_close_session_out_t_handler (vl_api_nat_det_close_session_out_t
+                                            * mp)
+{
+  snat_main_t *sm = &snat_main;
+  vl_api_nat_det_close_session_out_reply_t *rmp;
+  ip4_address_t out_addr, ext_addr, in_addr;
+  snat_det_out_key_t key;
+  snat_det_map_t *dm;
+  snat_det_session_t *ses;
+  int rv = 0;
+
+  clib_memcpy (&out_addr, mp->out_addr, 4);
+  clib_memcpy (&ext_addr, mp->ext_addr, 4);
+
+  dm = snat_det_map_by_out (sm, &out_addr);
+  if (!dm)
+    {
+      rv = VNET_API_ERROR_NO_SUCH_ENTRY;
+      goto send_reply;
+    }
+  snat_det_reverse (dm, &ext_addr, ntohs (mp->out_port), &in_addr);
+  key.ext_host_addr = ext_addr;
+  key.ext_host_port = mp->ext_port;
+  key.out_port = mp->out_port;
+  ses = snat_det_get_ses_by_out (dm, &in_addr, key.as_u64);
+  if (!ses)
+    {
+      rv = VNET_API_ERROR_NO_SUCH_ENTRY;
+      goto send_reply;
+    }
+  snat_det_ses_close (dm, ses);
+
+send_reply:
+  REPLY_MACRO (VL_API_NAT_DET_CLOSE_SESSION_OUT_REPLY);
+}
+
+static void *
+vl_api_nat_det_close_session_out_t_print (vl_api_nat_det_close_session_out_t *
+                                          mp, void *handle)
+{
+  u8 *s;
+
+  s = format (0, "SCRIPT: nat_det_close_session_out ");
+  s = format (s, "out_addr %U out_port %d "
+              "ext_addr %U ext_port %d\n",
+              format_ip4_address, mp->out_addr, ntohs (mp->out_port),
+              format_ip4_address, mp->ext_addr, ntohs (mp->ext_port));
+
+  FINISH;
+}
+
+static void
+vl_api_nat_det_close_session_in_t_handler (vl_api_nat_det_close_session_in_t *
+                                           mp)
+{
+  snat_main_t *sm = &snat_main;
+  vl_api_nat_det_close_session_in_reply_t *rmp;
+  ip4_address_t in_addr, ext_addr;
+  snat_det_out_key_t key;
+  snat_det_map_t *dm;
+  snat_det_session_t *ses;
+  int rv = 0;
+
+  if (!mp->is_nat44)
+    {
+      rv = VNET_API_ERROR_UNIMPLEMENTED;
+      goto send_reply;
+    }
+
+  clib_memcpy (&in_addr, mp->in_addr, 4);
+  clib_memcpy (&ext_addr, mp->ext_addr, 4);
+
+  dm = snat_det_map_by_user (sm, &in_addr);
+  if (!dm)
+    {
+      rv = VNET_API_ERROR_NO_SUCH_ENTRY;
+      goto send_reply;
+    }
+  key.ext_host_addr = ext_addr;
+  key.ext_host_port = mp->ext_port;
+  ses = snat_det_find_ses_by_in (dm, &in_addr, mp->in_port, key);
+  if (!ses)
+    {
+      rv = VNET_API_ERROR_NO_SUCH_ENTRY;
+      goto send_reply;
+    }
+  snat_det_ses_close (dm, ses);
+
+send_reply:
+  REPLY_MACRO (VL_API_NAT_DET_CLOSE_SESSION_IN_REPLY);
+}
+
+static void *
+vl_api_nat_det_close_session_in_t_print (vl_api_nat_det_close_session_in_t *
+                                         mp, void *handle)
+{
+  u8 *s;
+  s = format (0, "SCRIPT: nat_det_close_session_in ");
+  s = format (s, "in_addr %U in_port %d ext_addr %U ext_port %d\n",
+              format_ip4_address, mp->in_addr, ntohs (mp->in_port),
+              format_ip4_address, mp->ext_addr, ntohs (mp->ext_port));
+
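Both close-session handlers above assemble a snat_det_out_key_t and let the lookup helpers compare its as_u64 view, so matching a session against (external host address, external host port, outside port) is a single 64-bit integer compare instead of three field compares. A standalone sketch assuming that union layout (the real type lives in the plugin headers):

#include <stdint.h>
#include <stdio.h>

typedef union
{
  struct
  {
    uint32_t ext_host_addr;     /* ip4_address_t in the real code */
    uint16_t ext_host_port;
    uint16_t out_port;
  };
  uint64_t as_u64;
} det_out_key;

int
main (void)
{
  det_out_key a = { .ext_host_addr = 0x08080808,
                    .ext_host_port = 53, .out_port = 4242 };
  det_out_key b = a;

  printf ("match: %d\n", a.as_u64 == b.as_u64);  /* 1: one compare */
  return 0;
}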
FINISH; +} + +static void +send_nat_det_session_details (snat_det_session_t * s, + unix_shared_memory_queue_t * q, u32 context) +{ + vl_api_nat_det_session_details_t *rmp; + snat_main_t *sm = &snat_main; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_NAT_DET_SESSION_DETAILS + sm->msg_id_base); + rmp->in_port = s->in_port; + clib_memcpy (rmp->ext_addr, &s->out.ext_host_addr, 4); + rmp->ext_port = s->out.ext_host_port; + rmp->out_port = s->out.out_port; + rmp->state = s->state; + rmp->expire = ntohl (s->expire); + rmp->context = context; + + vl_msg_api_send_shmem (q, (u8 *) & rmp); +} + +static void +vl_api_nat_det_session_dump_t_handler (vl_api_nat_det_session_dump_t * mp) +{ + unix_shared_memory_queue_t *q; + snat_main_t *sm = &snat_main; + ip4_address_t user_addr; + snat_det_map_t *dm; + snat_det_session_t *s, empty_ses; + u16 i; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + return; + if (!mp->is_nat44) + return; + + memset (&empty_ses, 0, sizeof (empty_ses)); + clib_memcpy (&user_addr, mp->user_addr, 4); + dm = snat_det_map_by_user (sm, &user_addr); + if (!dm) + return; + + s = dm->sessions + snat_det_user_ses_offset (&user_addr, dm->in_plen); + for (i = 0; i < SNAT_DET_SES_PER_USER; i++) + { + if (s->out.as_u64) + send_nat_det_session_details (s, q, mp->context); + s++; + } +} + +static void * +vl_api_nat_det_session_dump_t_print (vl_api_nat_det_session_dump_t * mp, + void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat_det_session_dump "); + s = format (s, "user_addr %U\n", format_ip4_address, mp->user_addr); + + FINISH; +} + +/*************/ +/*** NAT64 ***/ +/*************/ + +static void + vl_api_nat64_add_del_pool_addr_range_t_handler + (vl_api_nat64_add_del_pool_addr_range_t * mp) +{ + vl_api_nat64_add_del_pool_addr_range_reply_t *rmp; + snat_main_t *sm = &snat_main; + nat64_main_t *nm = &nat64_main; + int rv = 0; + ip4_address_t this_addr; + u32 start_host_order, end_host_order; + u32 vrf_id; + int i, count; + u32 *tmp; + + if (nm->is_disabled) + { + rv = VNET_API_ERROR_FEATURE_DISABLED; + goto send_reply; + } + + tmp = (u32 *) mp->start_addr; + start_host_order = clib_host_to_net_u32 (tmp[0]); + tmp = (u32 *) mp->end_addr; + end_host_order = clib_host_to_net_u32 (tmp[0]); + + count = (end_host_order - start_host_order) + 1; + + vrf_id = clib_host_to_net_u32 (mp->vrf_id); + + memcpy (&this_addr.as_u8, mp->start_addr, 4); + + for (i = 0; i < count; i++) + { + if ((rv = nat64_add_del_pool_addr (&this_addr, vrf_id, mp->is_add))) + goto send_reply; + + increment_v4_address (&this_addr); + } + +send_reply: + REPLY_MACRO (VL_API_NAT64_ADD_DEL_POOL_ADDR_RANGE_REPLY); +} + +static void *vl_api_nat64_add_del_pool_addr_range_t_print + (vl_api_nat64_add_del_pool_addr_range_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat64_add_del_pool_addr_range "); + s = format (s, "%U - %U vrf_id %u %s\n", + format_ip4_address, mp->start_addr, + format_ip4_address, mp->end_addr, + ntohl (mp->vrf_id), mp->is_add ? 
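The NAT64 pool-range handler above treats the range as inclusive: both endpoints are byte-swapped to host order, count = (end - start) + 1, and the loop adds one address at a time via increment_v4_address. A standalone check of that arithmetic:

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

int
main (void)
{
  /* 10.0.0.5 .. 10.0.0.8 as they arrive in the message (network order) */
  uint32_t start = htonl (0x0a000005), end = htonl (0x0a000008);
  int count = (ntohl (end) - ntohl (start)) + 1;

  printf ("%d pool addresses\n", count);        /* 4, i.e. .5 .6 .7 .8 */

  uint32_t a = ntohl (start);
  for (int i = 0; i < count; i++, a++)          /* increment_v4_address loop */
    printf ("add 10.0.0.%u\n", a & 0xff);
  return 0;
}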
"" : "del"); + + FINISH; +} + +typedef struct nat64_api_walk_ctx_t_ +{ + unix_shared_memory_queue_t *q; + u32 context; +} nat64_api_walk_ctx_t; + +static int +nat64_api_pool_walk (snat_address_t * a, void *arg) +{ + vl_api_nat64_pool_addr_details_t *rmp; + snat_main_t *sm = &snat_main; + nat64_api_walk_ctx_t *ctx = arg; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_NAT64_POOL_ADDR_DETAILS + sm->msg_id_base); + clib_memcpy (rmp->address, &(a->addr), 4); + if (a->fib_index != ~0) + { + fib_table_t *fib = fib_table_get (a->fib_index, FIB_PROTOCOL_IP6); + if (!fib) + return -1; + rmp->vrf_id = ntohl (fib->ft_table_id); + } + else + rmp->vrf_id = ~0; + rmp->context = ctx->context; + + vl_msg_api_send_shmem (ctx->q, (u8 *) & rmp); + + return 0; +} + +static void +vl_api_nat64_pool_addr_dump_t_handler (vl_api_nat64_pool_addr_dump_t * mp) +{ + unix_shared_memory_queue_t *q; + nat64_main_t *nm = &nat64_main; + + if (nm->is_disabled) + return; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + return; + + nat64_api_walk_ctx_t ctx = { + .q = q, + .context = mp->context, + }; + + nat64_pool_addr_walk (nat64_api_pool_walk, &ctx); +} + +static void * +vl_api_nat64_pool_addr_dump_t_print (vl_api_nat64_pool_addr_dump_t * mp, + void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat64_pool_addr_dump\n"); + + FINISH; +} + +static void +vl_api_nat64_add_del_interface_t_handler (vl_api_nat64_add_del_interface_t * + mp) +{ + snat_main_t *sm = &snat_main; + nat64_main_t *nm = &nat64_main; + vl_api_nat64_add_del_interface_reply_t *rmp; + int rv = 0; + + if (nm->is_disabled) + { + rv = VNET_API_ERROR_FEATURE_DISABLED; + goto send_reply; + } + + VALIDATE_SW_IF_INDEX (mp); + + rv = + nat64_add_del_interface (ntohl (mp->sw_if_index), mp->is_inside, + mp->is_add); + + BAD_SW_IF_INDEX_LABEL; + +send_reply: + REPLY_MACRO (VL_API_NAT64_ADD_DEL_INTERFACE_REPLY); +} + +static void * +vl_api_nat64_add_del_interface_t_print (vl_api_nat64_add_del_interface_t * mp, + void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat64_add_del_interface "); + s = format (s, "sw_if_index %d %s %s", + clib_host_to_net_u32 (mp->sw_if_index), + mp->is_inside ? "in" : "out", mp->is_add ? 
"" : "del"); + + FINISH; +} + +static int +nat64_api_interface_walk (snat_interface_t * i, void *arg) +{ + vl_api_nat64_interface_details_t *rmp; + snat_main_t *sm = &snat_main; + nat64_api_walk_ctx_t *ctx = arg; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_NAT64_INTERFACE_DETAILS + sm->msg_id_base); + rmp->sw_if_index = ntohl (i->sw_if_index); + rmp->is_inside = i->is_inside; + rmp->context = ctx->context; + + vl_msg_api_send_shmem (ctx->q, (u8 *) & rmp); + + return 0; +} + +static void +vl_api_nat64_interface_dump_t_handler (vl_api_nat64_interface_dump_t * mp) +{ + unix_shared_memory_queue_t *q; + nat64_main_t *nm = &nat64_main; + + if (nm->is_disabled) + return; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + return; + + nat64_api_walk_ctx_t ctx = { + .q = q, + .context = mp->context, + }; + + nat64_interfaces_walk (nat64_api_interface_walk, &ctx); +} + +static void * +vl_api_nat64_interface_dump_t_print (vl_api_nat64_interface_dump_t * mp, + void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: snat_interface_dump "); + + FINISH; +} + +static void + vl_api_nat64_add_del_static_bib_t_handler + (vl_api_nat64_add_del_static_bib_t * mp) +{ + snat_main_t *sm = &snat_main; + nat64_main_t *nm = &nat64_main; + vl_api_nat64_add_del_static_bib_reply_t *rmp; + ip6_address_t in_addr; + ip4_address_t out_addr; + int rv = 0; + + if (nm->is_disabled) + { + rv = VNET_API_ERROR_FEATURE_DISABLED; + goto send_reply; + } + + memcpy (&in_addr.as_u8, mp->i_addr, 16); + memcpy (&out_addr.as_u8, mp->o_addr, 4); + + rv = + nat64_add_del_static_bib_entry (&in_addr, &out_addr, + clib_net_to_host_u16 (mp->i_port), + clib_net_to_host_u16 (mp->o_port), + mp->proto, + clib_net_to_host_u32 (mp->vrf_id), + mp->is_add); + +send_reply: + REPLY_MACRO (VL_API_NAT64_ADD_DEL_STATIC_BIB_REPLY); +} + +static void *vl_api_nat64_add_del_static_bib_t_print + (vl_api_nat64_add_del_static_bib_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat64_add_del_static_bib "); + s = format (s, "protocol %d i_addr %U o_addr %U ", + mp->proto, + format_ip6_address, mp->i_addr, format_ip4_address, mp->o_addr); + + if (mp->vrf_id != ~0) + s = format (s, "vrf %d", clib_net_to_host_u32 (mp->vrf_id)); + + FINISH; +} + +static int +nat64_api_bib_walk (nat64_db_bib_entry_t * bibe, void *arg) +{ + vl_api_nat64_bib_details_t *rmp; + snat_main_t *sm = &snat_main; + nat64_api_walk_ctx_t *ctx = arg; + fib_table_t *fib; + + fib = fib_table_get (bibe->fib_index, FIB_PROTOCOL_IP6); + if (!fib) + return -1; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_NAT64_BIB_DETAILS + sm->msg_id_base); + rmp->context = ctx->context; + clib_memcpy (rmp->i_addr, &(bibe->in_addr), 16); + clib_memcpy (rmp->o_addr, &(bibe->out_addr), 4); + rmp->i_port = bibe->in_port; + rmp->o_port = bibe->out_port; + rmp->vrf_id = ntohl (fib->ft_table_id); + rmp->proto = bibe->proto; + rmp->is_static = bibe->is_static; + rmp->ses_num = ntohl (bibe->ses_num); + + vl_msg_api_send_shmem (ctx->q, (u8 *) & rmp); + + return 0; +} + +static void +vl_api_nat64_bib_dump_t_handler (vl_api_nat64_bib_dump_t * mp) +{ + unix_shared_memory_queue_t *q; + nat64_main_t *nm = &nat64_main; + + if (nm->is_disabled) + return; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + return; + + nat64_api_walk_ctx_t ctx = { + .q = q, + .context = mp->context, + }; + + nat64_db_bib_walk (&nm->db, mp->proto, 
nat64_api_bib_walk, &ctx);
+}
+
+static void *
+vl_api_nat64_bib_dump_t_print (vl_api_nat64_bib_dump_t * mp, void *handle)
+{
+  u8 *s;
+
+  s = format (0, "SCRIPT: snat_bib_dump protocol %d", mp->proto);
+
+  FINISH;
+}
+
+static void
+vl_api_nat64_set_timeouts_t_handler (vl_api_nat64_set_timeouts_t * mp)
+{
+  snat_main_t *sm = &snat_main;
+  nat64_main_t *nm = &nat64_main;
+  vl_api_nat64_set_timeouts_reply_t *rmp;
+  int rv = 0;
+
+  if (nm->is_disabled)
+    {
+      rv = VNET_API_ERROR_FEATURE_DISABLED;
+      goto send_reply;
+    }
+
+  rv = nat64_set_icmp_timeout (ntohl (mp->icmp));
+  if (rv)
+    goto send_reply;
+  rv = nat64_set_udp_timeout (ntohl (mp->udp));
+  if (rv)
+    goto send_reply;
+  rv =
+    nat64_set_tcp_timeouts (ntohl (mp->tcp_trans), ntohl (mp->tcp_est),
+                            ntohl (mp->tcp_incoming_syn));
+
+send_reply:
+  REPLY_MACRO (VL_API_NAT64_SET_TIMEOUTS_REPLY);
+}
+
+static void *vl_api_nat64_set_timeouts_t_print
+  (vl_api_nat64_set_timeouts_t * mp, void *handle)
+{
+  u8 *s;
+
+  s = format (0, "SCRIPT: nat64_set_timeouts ");
+  s =
+    format (s,
+            "udp %d icmp %d tcp_trans %d tcp_est %d tcp_incoming_syn %d\n",
+            ntohl (mp->udp), ntohl (mp->icmp), ntohl (mp->tcp_trans),
+            ntohl (mp->tcp_est), ntohl (mp->tcp_incoming_syn));
+
+  FINISH;
+}
+
+static void
+vl_api_nat64_get_timeouts_t_handler (vl_api_nat64_get_timeouts_t * mp)
+{
+  snat_main_t *sm = &snat_main;
+  nat64_main_t *nm = &nat64_main;
+  vl_api_nat64_get_timeouts_reply_t *rmp;
+  int rv = 0;
+
+  if (nm->is_disabled)
+    return;
+
+  /* *INDENT-OFF* */
+  REPLY_MACRO2 (VL_API_NAT64_GET_TIMEOUTS_REPLY,
+  ({
+    rmp->udp = htonl (nat64_get_udp_timeout());
+    rmp->icmp = htonl (nat64_get_icmp_timeout());
+    rmp->tcp_trans = htonl (nat64_get_tcp_trans_timeout());
+    rmp->tcp_est = htonl (nat64_get_tcp_est_timeout());
+    rmp->tcp_incoming_syn = htonl (nat64_get_tcp_incoming_syn_timeout());
+  }))
+  /* *INDENT-ON* */
+}
+
+static void *vl_api_nat64_get_timeouts_t_print
+  (vl_api_nat64_get_timeouts_t * mp, void *handle)
+{
+  u8 *s;
+
+  s = format (0, "SCRIPT: nat64_get_timeouts");
+
+  FINISH;
+}
+
+static int
+nat64_api_st_walk (nat64_db_st_entry_t * ste, void *arg)
+{
+  vl_api_nat64_st_details_t *rmp;
+  snat_main_t *sm = &snat_main;
+  nat64_api_walk_ctx_t *ctx = arg;
+  nat64_main_t *nm = &nat64_main;
+  nat64_db_bib_entry_t *bibe;
+  fib_table_t *fib;
+
+  bibe = nat64_db_bib_entry_by_index (&nm->db, ste->proto, ste->bibe_index);
+  if (!bibe)
+    return -1;
+
+  fib = fib_table_get (bibe->fib_index, FIB_PROTOCOL_IP6);
+  if (!fib)
+    return -1;
+
+  rmp = vl_msg_api_alloc (sizeof (*rmp));
+  memset (rmp, 0, sizeof (*rmp));
+  rmp->_vl_msg_id = ntohs (VL_API_NAT64_ST_DETAILS + sm->msg_id_base);
+  rmp->context = ctx->context;
+  clib_memcpy (rmp->il_addr, &(bibe->in_addr), 16);
+  clib_memcpy (rmp->ol_addr, &(bibe->out_addr), 4);
+  rmp->il_port = bibe->in_port;
+  rmp->ol_port = bibe->out_port;
+  clib_memcpy (rmp->ir_addr, &(ste->in_r_addr), 16);
+  clib_memcpy (rmp->or_addr, &(ste->out_r_addr), 4);
+  rmp->r_port = ste->r_port;
+  rmp->vrf_id = ntohl (fib->ft_table_id);
+  rmp->proto = ste->proto;
+
+  vl_msg_api_send_shmem (ctx->q, (u8 *) & rmp);
+
+  return 0;
+}
+
+static void
+vl_api_nat64_st_dump_t_handler (vl_api_nat64_st_dump_t * mp)
+{
+  unix_shared_memory_queue_t *q;
+  nat64_main_t *nm = &nat64_main;
+
+  if (nm->is_disabled)
+    return;
+
+  q = vl_api_client_index_to_input_queue (mp->client_index);
+  if (q == 0)
+    return;
+
+  nat64_api_walk_ctx_t ctx = {
+    .q = q,
+    .context = mp->context,
+  };
+
+  nat64_db_st_walk (&nm->db, mp->proto, nat64_api_st_walk, &ctx);
+}
+
+static void *
+vl_api_nat64_st_dump_t_print (vl_api_nat64_st_dump_t * mp, void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: snat_st_dump protocol %d", mp->proto); + + FINISH; +} + +static void +vl_api_nat64_add_del_prefix_t_handler (vl_api_nat64_add_del_prefix_t * mp) +{ + vl_api_nat64_add_del_prefix_reply_t *rmp; + snat_main_t *sm = &snat_main; + nat64_main_t *nm = &nat64_main; + ip6_address_t prefix; + int rv = 0; + + if (nm->is_disabled) + { + rv = VNET_API_ERROR_FEATURE_DISABLED; + goto send_reply; + } + + memcpy (&prefix.as_u8, mp->prefix, 16); + + rv = + nat64_add_del_prefix (&prefix, mp->prefix_len, + clib_net_to_host_u32 (mp->vrf_id), mp->is_add); +send_reply: + REPLY_MACRO (VL_API_NAT64_ADD_DEL_PREFIX_REPLY); +} + +static void * +vl_api_nat64_add_del_prefix_t_print (vl_api_nat64_add_del_prefix_t * mp, + void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat64_add_del_prefix %U/%u vrf_id %u %s\n", + format_ip6_address, mp->prefix, mp->prefix_len, + ntohl (mp->vrf_id), mp->is_add ? "" : "del"); + + FINISH; +} + +static int +nat64_api_prefix_walk (nat64_prefix_t * p, void *arg) +{ + vl_api_nat64_prefix_details_t *rmp; + snat_main_t *sm = &snat_main; + nat64_api_walk_ctx_t *ctx = arg; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_NAT64_PREFIX_DETAILS + sm->msg_id_base); + clib_memcpy (rmp->prefix, &(p->prefix), 16); + rmp->prefix_len = p->plen; + rmp->vrf_id = ntohl (p->vrf_id); + rmp->context = ctx->context; + + vl_msg_api_send_shmem (ctx->q, (u8 *) & rmp); + + return 0; +} + +static void +vl_api_nat64_prefix_dump_t_handler (vl_api_nat64_prefix_dump_t * mp) +{ + unix_shared_memory_queue_t *q; + nat64_main_t *nm = &nat64_main; + + if (nm->is_disabled) + return; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + return; + + nat64_api_walk_ctx_t ctx = { + .q = q, + .context = mp->context, + }; + + nat64_prefix_walk (nat64_api_prefix_walk, &ctx); +} + +static void * +vl_api_nat64_prefix_dump_t_print (vl_api_nat64_prefix_dump_t * mp, + void *handle) +{ + u8 *s; + + s = format (0, "SCRIPT: nat64_prefix_dump\n"); + + FINISH; +} + +/* List of message types that this plugin understands */ +#define foreach_snat_plugin_api_msg \ +_(SNAT_ADD_ADDRESS_RANGE, snat_add_address_range) \ +_(SNAT_INTERFACE_ADD_DEL_FEATURE, snat_interface_add_del_feature) \ +_(SNAT_ADD_STATIC_MAPPING, snat_add_static_mapping) \ +_(SNAT_CONTROL_PING, snat_control_ping) \ +_(SNAT_STATIC_MAPPING_DUMP, snat_static_mapping_dump) \ +_(SNAT_SHOW_CONFIG, snat_show_config) \ +_(SNAT_ADDRESS_DUMP, snat_address_dump) \ +_(SNAT_INTERFACE_DUMP, snat_interface_dump) \ +_(SNAT_SET_WORKERS, snat_set_workers) \ +_(SNAT_WORKER_DUMP, snat_worker_dump) \ +_(SNAT_ADD_DEL_INTERFACE_ADDR, snat_add_del_interface_addr) \ +_(SNAT_INTERFACE_ADDR_DUMP, snat_interface_addr_dump) \ +_(SNAT_IPFIX_ENABLE_DISABLE, snat_ipfix_enable_disable) \ +_(SNAT_USER_DUMP, snat_user_dump) \ +_(SNAT_USER_SESSION_DUMP, snat_user_session_dump) \ +_(SNAT_INTERFACE_ADD_DEL_OUTPUT_FEATURE, \ + snat_interface_add_del_output_feature) \ +_(SNAT_INTERFACE_OUTPUT_FEATURE_DUMP, \ + snat_interface_output_feature_dump) \ +_(SNAT_ADD_DET_MAP, snat_add_det_map) \ +_(SNAT_DET_FORWARD, snat_det_forward) \ +_(SNAT_DET_REVERSE, snat_det_reverse) \ +_(SNAT_DET_MAP_DUMP, snat_det_map_dump) \ +_(SNAT_DET_SET_TIMEOUTS, snat_det_set_timeouts) \ +_(SNAT_DET_GET_TIMEOUTS, snat_det_get_timeouts) \ +_(SNAT_DET_CLOSE_SESSION_OUT, snat_det_close_session_out) \ +_(SNAT_DET_CLOSE_SESSION_IN, 
snat_det_close_session_in) \ +_(SNAT_DET_SESSION_DUMP, snat_det_session_dump) \ +_(NAT_CONTROL_PING, nat_control_ping) \ +_(NAT_SHOW_CONFIG, nat_show_config) \ +_(NAT_SET_WORKERS, nat_set_workers) \ +_(NAT_WORKER_DUMP, nat_worker_dump) \ +_(NAT_IPFIX_ENABLE_DISABLE, nat_ipfix_enable_disable) \ +_(NAT44_ADD_DEL_ADDRESS_RANGE, nat44_add_del_address_range) \ +_(NAT44_INTERFACE_ADD_DEL_FEATURE, nat44_interface_add_del_feature) \ +_(NAT44_ADD_DEL_STATIC_MAPPING, nat44_add_del_static_mapping) \ +_(NAT44_STATIC_MAPPING_DUMP, nat44_static_mapping_dump) \ +_(NAT44_ADDRESS_DUMP, nat44_address_dump) \ +_(NAT44_INTERFACE_DUMP, nat44_interface_dump) \ +_(NAT44_ADD_DEL_INTERFACE_ADDR, nat44_add_del_interface_addr) \ +_(NAT44_INTERFACE_ADDR_DUMP, nat44_interface_addr_dump) \ +_(NAT44_USER_DUMP, nat44_user_dump) \ +_(NAT44_USER_SESSION_DUMP, nat44_user_session_dump) \ +_(NAT44_INTERFACE_ADD_DEL_OUTPUT_FEATURE, \ + nat44_interface_add_del_output_feature) \ +_(NAT44_INTERFACE_OUTPUT_FEATURE_DUMP, \ + nat44_interface_output_feature_dump) \ +_(NAT44_ADD_DEL_LB_STATIC_MAPPING, nat44_add_del_lb_static_mapping) \ +_(NAT44_LB_STATIC_MAPPING_DUMP, nat44_lb_static_mapping_dump) \ +_(NAT_DET_ADD_DEL_MAP, nat_det_add_del_map) \ +_(NAT_DET_FORWARD, nat_det_forward) \ +_(NAT_DET_REVERSE, nat_det_reverse) \ +_(NAT_DET_MAP_DUMP, nat_det_map_dump) \ +_(NAT_DET_SET_TIMEOUTS, nat_det_set_timeouts) \ +_(NAT_DET_GET_TIMEOUTS, nat_det_get_timeouts) \ +_(NAT_DET_CLOSE_SESSION_OUT, nat_det_close_session_out) \ +_(NAT_DET_CLOSE_SESSION_IN, nat_det_close_session_in) \ +_(NAT_DET_SESSION_DUMP, nat_det_session_dump) \ +_(NAT64_ADD_DEL_POOL_ADDR_RANGE, nat64_add_del_pool_addr_range) \ +_(NAT64_POOL_ADDR_DUMP, nat64_pool_addr_dump) \ +_(NAT64_ADD_DEL_INTERFACE, nat64_add_del_interface) \ +_(NAT64_INTERFACE_DUMP, nat64_interface_dump) \ +_(NAT64_ADD_DEL_STATIC_BIB, nat64_add_del_static_bib) \ +_(NAT64_BIB_DUMP, nat64_bib_dump) \ +_(NAT64_SET_TIMEOUTS, nat64_set_timeouts) \ +_(NAT64_GET_TIMEOUTS, nat64_get_timeouts) \ +_(NAT64_ST_DUMP, nat64_st_dump) \ +_(NAT64_ADD_DEL_PREFIX, nat64_add_del_prefix) \ +_(NAT64_PREFIX_DUMP, nat64_prefix_dump) + +/* Set up the API message handling tables */ +static clib_error_t * +snat_plugin_api_hookup (vlib_main_t * vm) +{ + snat_main_t *sm __attribute__ ((unused)) = &snat_main; +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + sm->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_snat_plugin_api_msg; +#undef _ + + return 0; +} + +#define vl_msg_name_crc_list +#include <nat/nat_all_api_h.h> +#undef vl_msg_name_crc_list + +static void +setup_message_id_table (snat_main_t * sm, api_main_t * am) +{ +#define _(id,n,crc) \ + vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id + sm->msg_id_base); + foreach_vl_msg_name_crc_nat; +#undef _ +} + +static void +plugin_custom_dump_configure (snat_main_t * sm) +{ +#define _(n,f) sm->api_main->msg_print_handlers \ + [VL_API_##n + sm->msg_id_base] \ + = (void *) vl_api_##f##_t_print; + foreach_snat_plugin_api_msg; +#undef _ +} + +clib_error_t * +snat_api_init (vlib_main_t * vm, snat_main_t * sm) +{ + u8 *name; + clib_error_t *error = 0; + + name = format (0, "snat_%08x%c", api_version, 0); + + /* Ask for a correctly-sized block of API message decode slots */ + sm->msg_id_base = + vl_msg_api_get_msg_ids ((char *) name, VL_MSG_FIRST_AVAILABLE); + + error = snat_plugin_api_hookup (vm); + + /* Add our API messages to the global name_crc hash table */ + 
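Everything in the table above hinges on msg_id_base: snat_api_init asks the API registry for a contiguous block of message ids under the versioned name "snat_<api_version>", and both the handler registrations and every _vl_msg_id stamped on a reply add that base to the plugin-relative VL_API_* constants, which is how independently loaded plugins avoid id collisions. With made-up numbers:

#include <stdio.h>

int
main (void)
{
  /* Suppose the registry hands this plugin base id 247, and the
     generated plugin-relative id of NAT44_INTERFACE_DUMP is 7. */
  unsigned msg_id_base = 247;
  unsigned vl_api_nat44_interface_dump = 7;

  /* the id a client must actually put on the wire for this dump */
  printf ("wire id = %u\n", msg_id_base + vl_api_nat44_interface_dump);
  return 0;
}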
setup_message_id_table (sm, sm->api_main); + + plugin_custom_dump_configure (sm); + + vec_free (name); + + return error; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/nat/nat_det.c b/src/plugins/nat/nat_det.c new file mode 100644 index 00000000..3af6698c --- /dev/null +++ b/src/plugins/nat/nat_det.c @@ -0,0 +1,158 @@ +/* + * snat_det.c - deterministic NAT + * + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @file + * @brief deterministic NAT + */ + +#include <nat/nat_det.h> + + +/** + * @brief Add/delete deterministic NAT mapping. + * + * Create bijective mapping of inside address to outside address and port range + * pairs, with the purpose of enabling deterministic NAT to reduce logging in + * CGN deployments. + * + * @param sm SNAT main. + * @param in_addr Inside network address. + * @param in_plen Inside network prefix length. + * @param out_addr Outside network address. + * @param out_plen Outside network prefix length. + * @param is_add If 0 delete, otherwise add. + */ +int +snat_det_add_map (snat_main_t * sm, ip4_address_t * in_addr, u8 in_plen, + ip4_address_t * out_addr, u8 out_plen, int is_add) +{ + snat_det_map_t *det_map; + static snat_det_session_t empty_snat_det_session = { 0 }; + snat_interface_t *i; + ip4_address_t in_cmp, out_cmp; + u8 found = 0; + + in_cmp.as_u32 = in_addr->as_u32 & ip4_main.fib_masks[in_plen]; + out_cmp.as_u32 = out_addr->as_u32 & ip4_main.fib_masks[out_plen]; + vec_foreach (det_map, sm->det_maps) + { + /* Checking for overlapping addresses to be added here */ + if (det_map->in_addr.as_u32 == in_cmp.as_u32 && + det_map->in_plen == in_plen && + det_map->out_addr.as_u32 == out_cmp.as_u32 && + det_map->out_plen == out_plen) + { + found = 1; + break; + } + } + + /* If found, don't add again */ + if (found && is_add) + return VNET_API_ERROR_VALUE_EXIST; + + /* If not found, don't delete */ + if (!found && !is_add) + return VNET_API_ERROR_NO_SUCH_ENTRY; + + if (is_add) + { + pool_get (sm->det_maps, det_map); + memset (det_map, 0, sizeof (*det_map)); + det_map->in_addr.as_u32 = in_cmp.as_u32; + det_map->in_plen = in_plen; + det_map->out_addr.as_u32 = out_cmp.as_u32; + det_map->out_plen = out_plen; + det_map->sharing_ratio = (1 << (32 - in_plen)) / (1 << (32 - out_plen)); + det_map->ports_per_host = (65535 - 1023) / det_map->sharing_ratio; + + vec_validate_init_empty (det_map->sessions, + SNAT_DET_SES_PER_USER * (1 << (32 - in_plen)) - + 1, empty_snat_det_session); + } + else + { + vec_free (det_map->sessions); + vec_del1 (sm->det_maps, det_map - sm->det_maps); + } + + /* Add/del external address range to FIB */ + /* *INDENT-OFF* */ + pool_foreach (i, sm->interfaces, + ({ + if (i->is_inside) + continue; + + snat_add_del_addr_to_fib(out_addr, out_plen, i->sw_if_index, is_add); + break; + })); + /* *INDENT-ON* */ + return 0; +} + +/** + * @brief The 'nat-det-expire-walk' process's main loop. 
+ * + * Check expire time for active sessions. + */ +static uword +snat_det_expire_walk_fn (vlib_main_t * vm, vlib_node_runtime_t * rt, + vlib_frame_t * f) +{ + snat_main_t *sm = &snat_main; + snat_det_map_t *dm; + snat_det_session_t *ses; + + while (sm->deterministic) + { + vlib_process_wait_for_event_or_clock (vm, 10.0); + vlib_process_get_events (vm, NULL); + u32 now = (u32) vlib_time_now (vm); + /* *INDENT-OFF* */ + pool_foreach (dm, sm->det_maps, + ({ + vec_foreach(ses, dm->sessions) + { + /* Delete if session expired */ + if (ses->in_port && (ses->expire < now)) + snat_det_ses_close (dm, ses); + } + })); + /* *INDENT-ON* */ + } + + return 0; +} + +static vlib_node_registration_t snat_det_expire_walk_node; + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (snat_det_expire_walk_node, static) = { + .function = snat_det_expire_walk_fn, + .type = VLIB_NODE_TYPE_PROCESS, + .name = + "nat-det-expire-walk", +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/nat/nat_det.h b/src/plugins/nat/nat_det.h new file mode 100644 index 00000000..2ab7f27e --- /dev/null +++ b/src/plugins/nat/nat_det.h @@ -0,0 +1,196 @@ +/* + * snat_det.h - deterministic NAT definitions + * + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/** + * @file + * @brief deterministic NAT definitions + */ + +#ifndef __included_nat_det_h__ +#define __included_nat_det_h__ + +#include <vnet/ip/ip.h> +#include <nat/nat.h> +#include <nat/nat_ipfix_logging.h> + + +#define SNAT_DET_SES_PER_USER 1000 + + +int snat_det_add_map (snat_main_t * sm, ip4_address_t * in_addr, u8 in_plen, + ip4_address_t * out_addr, u8 out_plen, int is_add); + +always_inline int +is_addr_in_net (ip4_address_t * addr, ip4_address_t * net, u8 plen) +{ + if (net->as_u32 == (addr->as_u32 & ip4_main.fib_masks[plen])) + return 1; + return 0; +} + +always_inline snat_det_map_t * +snat_det_map_by_user (snat_main_t * sm, ip4_address_t * user_addr) +{ + snat_det_map_t *dm; + + /* *INDENT-OFF* */ + pool_foreach (dm, sm->det_maps, + ({ + if (is_addr_in_net(user_addr, &dm->in_addr, dm->in_plen)) + return dm; + })); + /* *INDENT-ON* */ + return 0; +} + +always_inline snat_det_map_t * +snat_det_map_by_out (snat_main_t * sm, ip4_address_t * out_addr) +{ + snat_det_map_t *dm; + + /* *INDENT-OFF* */ + pool_foreach (dm, sm->det_maps, + ({ + if (is_addr_in_net(out_addr, &dm->out_addr, dm->out_plen)) + return dm; + })); + /* *INDENT-ON* */ + return 0; +} + +always_inline void +snat_det_forward (snat_det_map_t * dm, ip4_address_t * in_addr, + ip4_address_t * out_addr, u16 * lo_port) +{ + u32 in_offset, out_offset; + + in_offset = clib_net_to_host_u32 (in_addr->as_u32) - + clib_net_to_host_u32 (dm->in_addr.as_u32); + out_offset = in_offset / dm->sharing_ratio; + out_addr->as_u32 = + clib_host_to_net_u32 (clib_net_to_host_u32 (dm->out_addr.as_u32) + + out_offset); + *lo_port = 1024 + dm->ports_per_host * (in_offset % dm->sharing_ratio); +} + +always_inline void +snat_det_reverse (snat_det_map_t * dm, ip4_address_t * out_addr, u16 out_port, + ip4_address_t * in_addr) +{ + u32 in_offset1, in_offset2, out_offset; + + out_offset = clib_net_to_host_u32 (out_addr->as_u32) - + clib_net_to_host_u32 (dm->out_addr.as_u32); + in_offset1 = out_offset * dm->sharing_ratio; + in_offset2 = (out_port - 1024) / dm->ports_per_host; + in_addr->as_u32 = + clib_host_to_net_u32 (clib_net_to_host_u32 (dm->in_addr.as_u32) + + in_offset1 + in_offset2); +} + +always_inline u32 +snat_det_user_ses_offset (ip4_address_t * addr, u8 plen) +{ + return (clib_net_to_host_u32 (addr->as_u32) & pow2_mask (32 - plen)) * + SNAT_DET_SES_PER_USER; +} + +always_inline snat_det_session_t * +snat_det_get_ses_by_out (snat_det_map_t * dm, ip4_address_t * in_addr, + u64 out_key) +{ + u32 user_offset; + u16 i; + + user_offset = snat_det_user_ses_offset (in_addr, dm->in_plen); + for (i = 0; i < SNAT_DET_SES_PER_USER; i++) + { + if (dm->sessions[i + user_offset].out.as_u64 == out_key) + return &dm->sessions[i + user_offset]; + } + + return 0; +} + +always_inline snat_det_session_t * +snat_det_find_ses_by_in (snat_det_map_t * dm, ip4_address_t * in_addr, + u16 in_port, snat_det_out_key_t out_key) +{ + snat_det_session_t *ses; + u32 user_offset; + u16 i; + + user_offset = snat_det_user_ses_offset (in_addr, dm->in_plen); + for (i = 0; i < SNAT_DET_SES_PER_USER; i++) + { + ses = &dm->sessions[i + user_offset]; + if (ses->in_port == in_port && + ses->out.ext_host_addr.as_u32 == out_key.ext_host_addr.as_u32 && + ses->out.ext_host_port == out_key.ext_host_port) + return &dm->sessions[i + user_offset]; + } + + return 0; +} + +always_inline snat_det_session_t * +snat_det_ses_create (snat_det_map_t * dm, ip4_address_t * in_addr, + u16 in_port, snat_det_out_key_t * out) +{ + u32 user_offset; + u16 i; + + user_offset = snat_det_user_ses_offset 
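The inline helpers above reduce the whole deterministic scheme to integer arithmetic: sharing_ratio inside hosts share each outside address, every host owns a fixed block of ports_per_host ports starting above 1023, and snat_det_user_ses_offset gives each host a private slab of SNAT_DET_SES_PER_USER session slots (claimed lock-free via compare-and-swap on in_port in snat_det_ses_create, continued just below). A standalone worked example for a /24-to-/28 map, mirroring the formulas in snat_det_add_map and snat_det_forward:

#include <stdint.h>
#include <stdio.h>

#define SNAT_DET_SES_PER_USER 1000

int
main (void)
{
  unsigned in_plen = 24, out_plen = 28;
  unsigned sharing_ratio = (1u << (32 - in_plen)) / (1u << (32 - out_plen));
  unsigned ports_per_host = (65535 - 1023) / sharing_ratio;

  printf ("sharing_ratio %u ports_per_host %u\n",
          sharing_ratio, ports_per_host);       /* 16 and 4032 */

  /* forward: inside host at offset 77, e.g. 10.0.0.77 in 10.0.0.0/24 */
  unsigned in_offset = 77;
  unsigned out_offset = in_offset / sharing_ratio;
  unsigned lo_port = 1024 + ports_per_host * (in_offset % sharing_ratio);
  printf ("out offset %u ports %u..%u\n",
          out_offset, lo_port, lo_port + ports_per_host - 1);
  /* outside address +4, ports 53440..57471 */

  /* each host also owns a fixed slab of session slots */
  uint32_t host_bits_mask = (1u << (32 - in_plen)) - 1;
  printf ("session slots start at %u\n",
          (in_offset & host_bits_mask) * SNAT_DET_SES_PER_USER);  /* 77000 */
  return 0;
}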
(in_addr, dm->in_plen); + + for (i = 0; i < SNAT_DET_SES_PER_USER; i++) + { + if (!dm->sessions[i + user_offset].in_port) + { + if (__sync_bool_compare_and_swap + (&dm->sessions[i + user_offset].in_port, 0, in_port)) + { + dm->sessions[i + user_offset].out.as_u64 = out->as_u64; + dm->sessions[i + user_offset].state = SNAT_SESSION_UNKNOWN; + dm->sessions[i + user_offset].expire = 0; + __sync_add_and_fetch (&dm->ses_num, 1); + return &dm->sessions[i + user_offset]; + } + } + } + + snat_ipfix_logging_max_entries_per_user (in_addr->as_u32); + return 0; +} + +always_inline void +snat_det_ses_close (snat_det_map_t * dm, snat_det_session_t * ses) +{ + if (__sync_bool_compare_and_swap (&ses->in_port, ses->in_port, 0)) + { + ses->out.as_u64 = 0; + __sync_add_and_fetch (&dm->ses_num, -1); + } +} + +#endif /* __included_nat_det_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/nat/nat_ipfix_logging.c b/src/plugins/nat/nat_ipfix_logging.c new file mode 100644 index 00000000..18430f5a --- /dev/null +++ b/src/plugins/nat/nat_ipfix_logging.c @@ -0,0 +1,848 @@ +/* + * nat_ipfix_logging.c - NAT Events IPFIX logging + * + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <vnet/flow/flow_report.h> +#include <vlibmemory/api.h> +#include <nat/nat.h> +#include <nat/nat_ipfix_logging.h> + +snat_ipfix_logging_main_t snat_ipfix_logging_main; + +#define NAT44_SESSION_CREATE_LEN 26 +#define NAT_ADDRESSES_EXHAUTED_LEN 13 +#define MAX_ENTRIES_PER_USER_LEN 17 + +#define NAT44_SESSION_CREATE_FIELD_COUNT 8 +#define NAT_ADDRESSES_EXHAUTED_FIELD_COUNT 3 +#define MAX_ENTRIES_PER_USER_FIELD_COUNT 4 + +typedef struct +{ + u8 nat_event; + u32 src_ip; + u32 nat_src_ip; + snat_protocol_t snat_proto; + u16 src_port; + u16 nat_src_port; + u32 vrf_id; +} snat_ipfix_logging_nat44_ses_args_t; + +typedef struct +{ + u32 pool_id; +} snat_ipfix_logging_addr_exhausted_args_t; + +typedef struct +{ + u32 src_ip; +} snat_ipfix_logging_max_entries_per_user_args_t; + +#define skip_if_disabled() \ +do { \ + snat_ipfix_logging_main_t *silm = &snat_ipfix_logging_main; \ + if (PREDICT_TRUE (!silm->enabled)) \ + return; \ +} while (0) + +/** + * @brief Create an IPFIX template packet rewrite string + * + * @param frm flow report main + * @param fr flow report + * @param collector_address collector address + * @param src_address source address + * @param collector_port collector + * @param event NAT event ID + * @param quota_event NAT quota exceeded event ID + * + * @returns template packet + */ +static inline u8 * +snat_template_rewrite (flow_report_main_t * frm, + flow_report_t * fr, + ip4_address_t * collector_address, + ip4_address_t * src_address, + u16 collector_port, + nat_event_t event, quota_exceed_event_t quota_event) +{ + snat_ipfix_logging_main_t *silm = &snat_ipfix_logging_main; + ip4_header_t *ip; + udp_header_t *udp; + ipfix_message_header_t *h; + ipfix_set_header_t *s; + ipfix_template_header_t *t; + ipfix_field_specifier_t *f; + ipfix_field_specifier_t *first_field; + u8 *rewrite = 0; + ip4_ipfix_template_packet_t *tp; + u32 field_count = 0; + flow_report_stream_t *stream; + + stream = &frm->streams[fr->stream_index]; + silm->stream_index = fr->stream_index; + + if (event == NAT_ADDRESSES_EXHAUTED) + { + field_count = NAT_ADDRESSES_EXHAUTED_FIELD_COUNT; + silm->addr_exhausted_template_id = fr->template_id; + } + else if (event == NAT44_SESSION_CREATE) + { + field_count = NAT44_SESSION_CREATE_FIELD_COUNT; + silm->nat44_session_template_id = fr->template_id; + } + else if (event == QUOTA_EXCEEDED) + { + if (quota_event == MAX_ENTRIES_PER_USER) + { + field_count = MAX_ENTRIES_PER_USER_FIELD_COUNT; + silm->max_entries_per_user_template_id = fr->template_id; + } + } + + /* allocate rewrite space */ + vec_validate_aligned (rewrite, + sizeof (ip4_ipfix_template_packet_t) + + field_count * sizeof (ipfix_field_specifier_t) - 1, + CLIB_CACHE_LINE_BYTES); + + tp = (ip4_ipfix_template_packet_t *) rewrite; + ip = (ip4_header_t *) & tp->ip4; + udp = (udp_header_t *) (ip + 1); + h = (ipfix_message_header_t *) (udp + 1); + s = (ipfix_set_header_t *) (h + 1); + t = (ipfix_template_header_t *) (s + 1); + first_field = f = (ipfix_field_specifier_t *) (t + 1); + + ip->ip_version_and_header_length = 0x45; + ip->ttl = 254; + ip->protocol = IP_PROTOCOL_UDP; + ip->src_address.as_u32 = src_address->as_u32; + ip->dst_address.as_u32 = collector_address->as_u32; + udp->src_port = clib_host_to_net_u16 (stream->src_port); + udp->dst_port = clib_host_to_net_u16 (collector_port); + udp->length = clib_host_to_net_u16 (vec_len (rewrite) - sizeof (*ip)); + + /* FIXUP: message header export_time */ + h->domain_id = clib_host_to_net_u32 (stream->domain_id); + + /* Add TLVs to the template */ + if 
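The three *_LEN constants above are nothing more than the byte sums of the template fields that snat_template_rewrite lays out below, so each appended record must advance the buffer by exactly that much. A quick sanity check of the sums:

#include <stdio.h>

int
main (void)
{
  /* nat44_session_create: time(8) event(1) src(4) nat-src(4) proto(1)
     sport(2) nat-sport(2) vrf(4) */
  printf ("%d\n", 8 + 1 + 4 + 4 + 1 + 2 + 2 + 4);       /* 26 */
  /* addresses_exhausted: time(8) event(1) pool_id(4) */
  printf ("%d\n", 8 + 1 + 4);                           /* 13 */
  /* max_entries_per_user: time(8) event(1) quota_event(4) src(4) */
  printf ("%d\n", 8 + 1 + 4 + 4);                       /* 17 */
  return 0;
}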
(event == NAT_ADDRESSES_EXHAUTED) + { + f->e_id_length = ipfix_e_id_length (0, observationTimeMilliseconds, 8); + f++; + f->e_id_length = ipfix_e_id_length (0, natEvent, 1); + f++; + f->e_id_length = ipfix_e_id_length (0, natPoolId, 4); + f++; + } + else if (event == NAT44_SESSION_CREATE) + { + f->e_id_length = ipfix_e_id_length (0, observationTimeMilliseconds, 8); + f++; + f->e_id_length = ipfix_e_id_length (0, natEvent, 1); + f++; + f->e_id_length = ipfix_e_id_length (0, sourceIPv4Address, 4); + f++; + f->e_id_length = ipfix_e_id_length (0, postNATSourceIPv4Address, 4); + f++; + f->e_id_length = ipfix_e_id_length (0, protocolIdentifier, 1); + f++; + f->e_id_length = ipfix_e_id_length (0, sourceTransportPort, 2); + f++; + f->e_id_length = ipfix_e_id_length (0, postNAPTSourceTransportPort, 2); + f++; + f->e_id_length = ipfix_e_id_length (0, ingressVRFID, 4); + f++; + } + else if (event == QUOTA_EXCEEDED) + { + if (quota_event == MAX_ENTRIES_PER_USER) + { + f->e_id_length = ipfix_e_id_length (0, observationTimeMilliseconds, + 8); + f++; + f->e_id_length = ipfix_e_id_length (0, natEvent, 1); + f++; + f->e_id_length = ipfix_e_id_length (0, natQuotaExceededEvent, 4); + f++; + f->e_id_length = ipfix_e_id_length (0, sourceIPv4Address, 4); + f++; + } + } + + /* Back to the template packet... */ + ip = (ip4_header_t *) & tp->ip4; + udp = (udp_header_t *) (ip + 1); + + ASSERT (f - first_field); + /* Field count in this template */ + t->id_count = ipfix_id_count (fr->template_id, f - first_field); + + /* set length in octets */ + s->set_id_length = + ipfix_set_id_length (2 /* set_id */ , (u8 *) f - (u8 *) s); + + /* message length in octets */ + h->version_length = version_length ((u8 *) f - (u8 *) h); + + ip->length = clib_host_to_net_u16 ((u8 *) f - (u8 *) ip); + ip->checksum = ip4_header_checksum (ip); + + return rewrite; +} + +u8 * +snat_template_rewrite_addr_exhausted (flow_report_main_t * frm, + flow_report_t * fr, + ip4_address_t * collector_address, + ip4_address_t * src_address, + u16 collector_port) +{ + return snat_template_rewrite (frm, fr, collector_address, src_address, + collector_port, NAT_ADDRESSES_EXHAUTED, 0); +} + +u8 * +snat_template_rewrite_nat44_session (flow_report_main_t * frm, + flow_report_t * fr, + ip4_address_t * collector_address, + ip4_address_t * src_address, + u16 collector_port) +{ + return snat_template_rewrite (frm, fr, collector_address, src_address, + collector_port, NAT44_SESSION_CREATE, 0); +} + +u8 * +snat_template_rewrite_max_entries_per_usr (flow_report_main_t * frm, + flow_report_t * fr, + ip4_address_t * collector_address, + ip4_address_t * src_address, + u16 collector_port) +{ + return snat_template_rewrite (frm, fr, collector_address, src_address, + collector_port, QUOTA_EXCEEDED, + MAX_ENTRIES_PER_USER); +} + +static inline void +snat_ipfix_header_create (flow_report_main_t * frm, + vlib_buffer_t * b0, u32 * offset) +{ + snat_ipfix_logging_main_t *silm = &snat_ipfix_logging_main; + flow_report_stream_t *stream; + ip4_ipfix_template_packet_t *tp; + ipfix_message_header_t *h = 0; + ipfix_set_header_t *s = 0; + ip4_header_t *ip; + udp_header_t *udp; + + stream = &frm->streams[silm->stream_index]; + + b0->current_data = 0; + b0->current_length = sizeof (*ip) + sizeof (*udp) + sizeof (*h) + + sizeof (*s); + b0->flags |= (VLIB_BUFFER_TOTAL_LENGTH_VALID | VLIB_BUFFER_FLOW_REPORT); + vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = frm->fib_index; + tp = vlib_buffer_get_current (b0); + ip = (ip4_header_t *) & tp->ip4; + 
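On the wire, each ipfix_field_specifier_t written above encodes to 4 bytes (2-byte information element id plus 2-byte field length, with no enterprise number), and the template header is another 4 bytes (template id plus field count, packed by ipfix_id_count). The NAT44 session template record therefore comes to 4 + 8 * 4 = 36 bytes inside its set; checking:

#include <stdio.h>

int
main (void)
{
  int field_specifier = 2 + 2;  /* element id + field length */
  int template_header = 2 + 2;  /* template id + field count */
  int nat44_fields = 8;         /* NAT44_SESSION_CREATE_FIELD_COUNT */

  printf ("%d byte template record\n",
          template_header + nat44_fields * field_specifier);    /* 36 */
  return 0;
}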
udp = (udp_header_t *) (ip + 1); + h = (ipfix_message_header_t *) (udp + 1); + s = (ipfix_set_header_t *) (h + 1); + + ip->ip_version_and_header_length = 0x45; + ip->ttl = 254; + ip->protocol = IP_PROTOCOL_UDP; + ip->flags_and_fragment_offset = 0; + ip->src_address.as_u32 = frm->src_address.as_u32; + ip->dst_address.as_u32 = frm->ipfix_collector.as_u32; + udp->src_port = clib_host_to_net_u16 (stream->src_port); + udp->dst_port = clib_host_to_net_u16 (frm->collector_port); + udp->checksum = 0; + + h->export_time = clib_host_to_net_u32 ((u32) + (((f64) frm->unix_time_0) + + (vlib_time_now (frm->vlib_main) - + frm->vlib_time_0))); + h->sequence_number = clib_host_to_net_u32 (stream->sequence_number++); + h->domain_id = clib_host_to_net_u32 (stream->domain_id); + + *offset = (u32) (((u8 *) (s + 1)) - (u8 *) tp); +} + +static inline void +snat_ipfix_send (flow_report_main_t * frm, + vlib_frame_t * f, vlib_buffer_t * b0, u16 template_id) +{ + ip4_ipfix_template_packet_t *tp; + ipfix_message_header_t *h = 0; + ipfix_set_header_t *s = 0; + ip4_header_t *ip; + udp_header_t *udp; + vlib_main_t *vm = frm->vlib_main; + + tp = vlib_buffer_get_current (b0); + ip = (ip4_header_t *) & tp->ip4; + udp = (udp_header_t *) (ip + 1); + h = (ipfix_message_header_t *) (udp + 1); + s = (ipfix_set_header_t *) (h + 1); + + s->set_id_length = ipfix_set_id_length (template_id, + b0->current_length - + (sizeof (*ip) + sizeof (*udp) + + sizeof (*h))); + h->version_length = version_length (b0->current_length - + (sizeof (*ip) + sizeof (*udp))); + + ip->length = clib_host_to_net_u16 (b0->current_length); + ip->checksum = ip4_header_checksum (ip); + udp->length = clib_host_to_net_u16 (b0->current_length - sizeof (*ip)); + + if (frm->udp_checksum) + { + udp->checksum = ip4_tcp_udp_compute_checksum (vm, b0, ip); + if (udp->checksum == 0) + udp->checksum = 0xffff; + } + + ASSERT (ip->checksum == ip4_header_checksum (ip)); + + vlib_put_frame_to_node (vm, ip4_lookup_node.index, f); +} + +static void +snat_ipfix_logging_nat44_ses (u8 nat_event, u32 src_ip, u32 nat_src_ip, + snat_protocol_t snat_proto, u16 src_port, + u16 nat_src_port, u32 vrf_id, int do_flush) +{ + snat_ipfix_logging_main_t *silm = &snat_ipfix_logging_main; + flow_report_main_t *frm = &flow_report_main; + vlib_frame_t *f; + vlib_buffer_t *b0 = 0; + u32 bi0 = ~0; + u32 offset; + vlib_main_t *vm = frm->vlib_main; + u64 now; + vlib_buffer_free_list_t *fl; + u8 proto = ~0; + + if (!silm->enabled) + return; + + proto = snat_proto_to_ip_proto (snat_proto); + + now = (u64) ((vlib_time_now (vm) - silm->vlib_time_0) * 1e3); + now += silm->milisecond_time_0; + + b0 = silm->nat44_session_buffer; + + if (PREDICT_FALSE (b0 == 0)) + { + if (do_flush) + return; + + if (vlib_buffer_alloc (vm, &bi0, 1) != 1) + { + clib_warning ("can't allocate buffer for NAT IPFIX event"); + return; + } + + b0 = silm->nat44_session_buffer = vlib_get_buffer (vm, bi0); + fl = + vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + vlib_buffer_init_for_free_list (b0, fl); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + offset = 0; + } + else + { + bi0 = vlib_get_buffer_index (vm, b0); + offset = silm->nat44_session_next_record_offset; + } + + f = silm->nat44_session_frame; + if (PREDICT_FALSE (f == 0)) + { + u32 *to_next; + f = vlib_get_frame_to_node (vm, ip4_lookup_node.index); + silm->nat44_session_frame = f; + to_next = vlib_frame_vector_args (f); + to_next[0] = bi0; + f->n_vectors = 1; + } + + if (PREDICT_FALSE (offset == 0)) + snat_ipfix_header_create (frm, b0, &offset); + + if 
(PREDICT_TRUE (do_flush == 0)) + { + u64 time_stamp = clib_host_to_net_u64 (now); + clib_memcpy (b0->data + offset, &time_stamp, sizeof (time_stamp)); + offset += sizeof (time_stamp); + + clib_memcpy (b0->data + offset, &nat_event, sizeof (nat_event)); + offset += sizeof (nat_event); + + clib_memcpy (b0->data + offset, &src_ip, sizeof (src_ip)); + offset += sizeof (src_ip); + + clib_memcpy (b0->data + offset, &nat_src_ip, sizeof (nat_src_ip)); + offset += sizeof (nat_src_ip); + + clib_memcpy (b0->data + offset, &proto, sizeof (proto)); + offset += sizeof (proto); + + clib_memcpy (b0->data + offset, &src_port, sizeof (src_port)); + offset += sizeof (src_port); + + clib_memcpy (b0->data + offset, &nat_src_port, sizeof (nat_src_port)); + offset += sizeof (nat_src_port); + + clib_memcpy (b0->data + offset, &vrf_id, sizeof (vrf_id)); + offset += sizeof (vrf_id); + + b0->current_length += NAT44_SESSION_CREATE_LEN; + } + + if (PREDICT_FALSE + (do_flush || (offset + NAT44_SESSION_CREATE_LEN) > frm->path_mtu)) + { + snat_ipfix_send (frm, f, b0, silm->nat44_session_template_id); + silm->nat44_session_frame = 0; + silm->nat44_session_buffer = 0; + offset = 0; + } + silm->nat44_session_next_record_offset = offset; +} + +static void +snat_ipfix_logging_addr_exhausted (u32 pool_id, int do_flush) +{ + snat_ipfix_logging_main_t *silm = &snat_ipfix_logging_main; + flow_report_main_t *frm = &flow_report_main; + vlib_frame_t *f; + vlib_buffer_t *b0 = 0; + u32 bi0 = ~0; + u32 offset; + vlib_main_t *vm = frm->vlib_main; + u64 now; + vlib_buffer_free_list_t *fl; + u8 nat_event = NAT_ADDRESSES_EXHAUTED; + + if (!silm->enabled) + return; + + now = (u64) ((vlib_time_now (vm) - silm->vlib_time_0) * 1e3); + now += silm->milisecond_time_0; + + b0 = silm->addr_exhausted_buffer; + + if (PREDICT_FALSE (b0 == 0)) + { + if (do_flush) + return; + + if (vlib_buffer_alloc (vm, &bi0, 1) != 1) + { + clib_warning ("can't allocate buffer for NAT IPFIX event"); + return; + } + + b0 = silm->addr_exhausted_buffer = vlib_get_buffer (vm, bi0); + fl = + vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + vlib_buffer_init_for_free_list (b0, fl); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + offset = 0; + } + else + { + bi0 = vlib_get_buffer_index (vm, b0); + offset = silm->addr_exhausted_next_record_offset; + } + + f = silm->addr_exhausted_frame; + if (PREDICT_FALSE (f == 0)) + { + u32 *to_next; + f = vlib_get_frame_to_node (vm, ip4_lookup_node.index); + silm->addr_exhausted_frame = f; + to_next = vlib_frame_vector_args (f); + to_next[0] = bi0; + f->n_vectors = 1; + } + + if (PREDICT_FALSE (offset == 0)) + snat_ipfix_header_create (frm, b0, &offset); + + if (PREDICT_TRUE (do_flush == 0)) + { + u64 time_stamp = clib_host_to_net_u64 (now); + clib_memcpy (b0->data + offset, &time_stamp, sizeof (time_stamp)); + offset += sizeof (time_stamp); + + clib_memcpy (b0->data + offset, &nat_event, sizeof (nat_event)); + offset += sizeof (nat_event); + + clib_memcpy (b0->data + offset, &pool_id, sizeof (pool_id)); + offset += sizeof (pool_id); + + b0->current_length += NAT_ADDRESSES_EXHAUTED_LEN; + } + + if (PREDICT_FALSE + (do_flush || (offset + NAT_ADDRESSES_EXHAUTED_LEN) > frm->path_mtu)) + { + snat_ipfix_send (frm, f, b0, silm->addr_exhausted_template_id); + silm->addr_exhausted_frame = 0; + silm->addr_exhausted_buffer = 0; + offset = 0; + } + silm->addr_exhausted_next_record_offset = offset; +} + +static void +snat_ipfix_logging_max_entries_per_usr (u32 src_ip, int do_flush) +{ + snat_ipfix_logging_main_t *silm = 
&snat_ipfix_logging_main; + flow_report_main_t *frm = &flow_report_main; + vlib_frame_t *f; + vlib_buffer_t *b0 = 0; + u32 bi0 = ~0; + u32 offset; + vlib_main_t *vm = frm->vlib_main; + u64 now; + vlib_buffer_free_list_t *fl; + u8 nat_event = QUOTA_EXCEEDED; + u32 quota_event = MAX_ENTRIES_PER_USER; + + if (!silm->enabled) + return; + + now = (u64) ((vlib_time_now (vm) - silm->vlib_time_0) * 1e3); + now += silm->milisecond_time_0; + + b0 = silm->max_entries_per_user_buffer; + + if (PREDICT_FALSE (b0 == 0)) + { + if (do_flush) + return; + + if (vlib_buffer_alloc (vm, &bi0, 1) != 1) + { + clib_warning ("can't allocate buffer for NAT IPFIX event"); + return; + } + + b0 = silm->max_entries_per_user_buffer = vlib_get_buffer (vm, bi0); + fl = + vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + vlib_buffer_init_for_free_list (b0, fl); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + offset = 0; + } + else + { + bi0 = vlib_get_buffer_index (vm, b0); + offset = silm->max_entries_per_user_next_record_offset; + } + + f = silm->max_entries_per_user_frame; + if (PREDICT_FALSE (f == 0)) + { + u32 *to_next; + f = vlib_get_frame_to_node (vm, ip4_lookup_node.index); + silm->max_entries_per_user_frame = f; + to_next = vlib_frame_vector_args (f); + to_next[0] = bi0; + f->n_vectors = 1; + } + + if (PREDICT_FALSE (offset == 0)) + snat_ipfix_header_create (frm, b0, &offset); + + if (PREDICT_TRUE (do_flush == 0)) + { + u64 time_stamp = clib_host_to_net_u64 (now); + clib_memcpy (b0->data + offset, &time_stamp, sizeof (time_stamp)); + offset += sizeof (time_stamp); + + clib_memcpy (b0->data + offset, &nat_event, sizeof (nat_event)); + offset += sizeof (nat_event); + + clib_memcpy (b0->data + offset, &quota_event, sizeof (quota_event)); + offset += sizeof (quota_event); + + clib_memcpy (b0->data + offset, &src_ip, sizeof (src_ip)); + offset += sizeof (src_ip); + + b0->current_length += MAX_ENTRIES_PER_USER_LEN; + } + + if (PREDICT_FALSE + (do_flush || (offset + MAX_ENTRIES_PER_USER_LEN) > frm->path_mtu)) + { + snat_ipfix_send (frm, f, b0, silm->max_entries_per_user_template_id); + silm->max_entries_per_user_frame = 0; + silm->max_entries_per_user_buffer = 0; + offset = 0; + } + silm->max_entries_per_user_next_record_offset = offset; +} + +static void +snat_ipfix_logging_nat44_ses_rpc_cb (snat_ipfix_logging_nat44_ses_args_t * a) +{ + snat_ipfix_logging_nat44_ses (a->nat_event, a->src_ip, a->nat_src_ip, + a->snat_proto, a->src_port, a->nat_src_port, + a->vrf_id, 0); +} + +/** + * @brief Generate NAT44 session create event + * + * @param src_ip source IPv4 address + * @param nat_src_ip translated source IPv4 address + * @param snat_proto NAT transport protocol + * @param src_port source port + * @param nat_src_port translated source port + * @param vrf_id VRF ID + */ +void +snat_ipfix_logging_nat44_ses_create (u32 src_ip, + u32 nat_src_ip, + snat_protocol_t snat_proto, + u16 src_port, + u16 nat_src_port, u32 vrf_id) +{ + snat_ipfix_logging_nat44_ses_args_t a; + + skip_if_disabled (); + + a.nat_event = NAT44_SESSION_CREATE; + a.src_ip = src_ip; + a.nat_src_ip = nat_src_ip; + a.snat_proto = snat_proto; + a.src_port = src_port; + a.nat_src_port = nat_src_port; + a.vrf_id = vrf_id; + + vl_api_rpc_call_main_thread (snat_ipfix_logging_nat44_ses_rpc_cb, + (u8 *) & a, sizeof (a)); +} + +/** + * @brief Generate NAT44 session delete event + * + * @param src_ip source IPv4 address + * @param nat_src_ip translated source IPv4 address + * @param snat_proto NAT transport protocol + * @param src_port source port + * 
@param nat_src_port translated source port + * @param vrf_id VRF ID + */ +void +snat_ipfix_logging_nat44_ses_delete (u32 src_ip, + u32 nat_src_ip, + snat_protocol_t snat_proto, + u16 src_port, + u16 nat_src_port, u32 vrf_id) +{ + snat_ipfix_logging_nat44_ses_args_t a; + + skip_if_disabled (); + + a.nat_event = NAT44_SESSION_DELETE; + a.src_ip = src_ip; + a.nat_src_ip = nat_src_ip; + a.snat_proto = snat_proto; + a.src_port = src_port; + a.nat_src_port = nat_src_port; + a.vrf_id = vrf_id; + + vl_api_rpc_call_main_thread (snat_ipfix_logging_nat44_ses_rpc_cb, + (u8 *) & a, sizeof (a)); +} + +vlib_frame_t * +snat_data_callback_nat44_session (flow_report_main_t * frm, + flow_report_t * fr, + vlib_frame_t * f, + u32 * to_next, u32 node_index) +{ + snat_ipfix_logging_nat44_ses (0, 0, 0, 0, 0, 0, 0, 1); + return f; +} + +static void + snat_ipfix_logging_addr_exhausted_rpc_cb + (snat_ipfix_logging_addr_exhausted_args_t * a) +{ + snat_ipfix_logging_addr_exhausted (a->pool_id, 0); +} + +/** + * @brief Generate NAT addresses exhausted event + * + * @param pool_id NAT pool ID + */ +void +snat_ipfix_logging_addresses_exhausted (u32 pool_id) +{ + //TODO: This event SHOULD be rate limited + snat_ipfix_logging_addr_exhausted_args_t a; + + skip_if_disabled (); + + a.pool_id = pool_id; + + vl_api_rpc_call_main_thread (snat_ipfix_logging_addr_exhausted_rpc_cb, + (u8 *) & a, sizeof (a)); +} + +vlib_frame_t * +snat_data_callback_addr_exhausted (flow_report_main_t * frm, + flow_report_t * fr, + vlib_frame_t * f, + u32 * to_next, u32 node_index) +{ + snat_ipfix_logging_addr_exhausted (0, 1); + return f; +} + +static void + snat_ipfix_logging_max_entries_per_usr_rpc_cb + (snat_ipfix_logging_max_entries_per_user_args_t * a) +{ + snat_ipfix_logging_max_entries_per_usr (a->src_ip, 0); +} + +/** + * @brief Generate maximum entries per user exceeded event + * + * @param src_ip source IPv4 address + */ +void +snat_ipfix_logging_max_entries_per_user (u32 src_ip) +{ + //TODO: This event SHOULD be rate limited + snat_ipfix_logging_max_entries_per_user_args_t a; + + skip_if_disabled (); + + a.src_ip = src_ip; + + vl_api_rpc_call_main_thread (snat_ipfix_logging_max_entries_per_usr_rpc_cb, + (u8 *) & a, sizeof (a)); +} + +vlib_frame_t * +snat_data_callback_max_entries_per_usr (flow_report_main_t * frm, + flow_report_t * fr, + vlib_frame_t * f, + u32 * to_next, u32 node_index) +{ + snat_ipfix_logging_max_entries_per_usr (0, 1); + return f; +} + +/** + * @brief Enable/disable NAT plugin IPFIX logging + * + * @param enable 1 if enable, 0 if disable + * @param domain_id observation domain ID + * @param src_port source port number + * + * @returns 0 if success + */ +int +snat_ipfix_logging_enable_disable (int enable, u32 domain_id, u16 src_port) +{ + snat_main_t *sm = &snat_main; + snat_ipfix_logging_main_t *silm = &snat_ipfix_logging_main; + flow_report_main_t *frm = &flow_report_main; + vnet_flow_report_add_del_args_t a; + int rv; + u8 e = enable ? 1 : 0; + + if (silm->enabled == e) + return 0; + + silm->enabled = e; + + memset (&a, 0, sizeof (a)); + a.is_add = enable; + a.domain_id = domain_id ? domain_id : 1; + a.src_port = src_port ? 
src_port : UDP_DST_PORT_ipfix; + + if (sm->deterministic) + { + a.rewrite_callback = snat_template_rewrite_max_entries_per_usr; + a.flow_data_callback = snat_data_callback_max_entries_per_usr; + + rv = vnet_flow_report_add_del (frm, &a, NULL); + if (rv) + { + clib_warning ("vnet_flow_report_add_del returned %d", rv); + return -1; + } + } + else + { + a.rewrite_callback = snat_template_rewrite_nat44_session; + a.flow_data_callback = snat_data_callback_nat44_session; + + rv = vnet_flow_report_add_del (frm, &a, NULL); + if (rv) + { + clib_warning ("vnet_flow_report_add_del returned %d", rv); + return -1; + } + + a.rewrite_callback = snat_template_rewrite_addr_exhausted; + a.flow_data_callback = snat_data_callback_addr_exhausted; + + rv = vnet_flow_report_add_del (frm, &a, NULL); + if (rv) + { + clib_warning ("vnet_flow_report_add_del returned %d", rv); + return -1; + } + } + + return 0; +} + +/** + * @brief Initialize NAT plugin IPFIX logging + * + * @param vm vlib main + */ +void +snat_ipfix_logging_init (vlib_main_t * vm) +{ + snat_ipfix_logging_main_t *silm = &snat_ipfix_logging_main; + + silm->enabled = 0; + + /* Set up time reference pair */ + silm->vlib_time_0 = vlib_time_now (vm); + silm->milisecond_time_0 = unix_time_now_nsec () * 1e-6; +} diff --git a/src/plugins/nat/nat_ipfix_logging.h b/src/plugins/nat/nat_ipfix_logging.h new file mode 100644 index 00000000..6dbf6627 --- /dev/null +++ b/src/plugins/nat/nat_ipfix_logging.h @@ -0,0 +1,79 @@ +/* + * nat_ipfix_logging.h - NAT Events IPFIX logging + * + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __included_nat_ipfix_logging_h__ +#define __included_nat_ipfix_logging_h__ + +typedef enum { + NAT_ADDRESSES_EXHAUTED = 3, + NAT44_SESSION_CREATE = 4, + NAT44_SESSION_DELETE = 5, + NAT_PORTS_EXHAUSTED = 12, + QUOTA_EXCEEDED = 13, +} nat_event_t; + +typedef enum { + MAX_ENTRIES_PER_USER = 3, +} quota_exceed_event_t; + +typedef struct { + /** NAT plugin IPFIX logging enabled */ + u8 enabled; + + /** ipfix buffers under construction */ + vlib_buffer_t *nat44_session_buffer; + vlib_buffer_t *addr_exhausted_buffer; + vlib_buffer_t *max_entries_per_user_buffer; + + /** frames containing ipfix buffers */ + vlib_frame_t *nat44_session_frame; + vlib_frame_t *addr_exhausted_frame; + vlib_frame_t *max_entries_per_user_frame; + + /** next record offset */ + u32 nat44_session_next_record_offset; + u32 addr_exhausted_next_record_offset; + u32 max_entries_per_user_next_record_offset; + + /** Time reference pair */ + u64 milisecond_time_0; + f64 vlib_time_0; + + /** template IDs */ + u16 nat44_session_template_id; + u16 addr_exhausted_template_id; + u16 max_entries_per_user_template_id; + + /** stream index */ + u32 stream_index; +} snat_ipfix_logging_main_t; + +extern snat_ipfix_logging_main_t snat_ipfix_logging_main; + +void snat_ipfix_logging_init (vlib_main_t * vm); +int snat_ipfix_logging_enable_disable (int enable, u32 domain_id, u16 src_port); +void snat_ipfix_logging_nat44_ses_create (u32 src_ip, u32 nat_src_ip, + snat_protocol_t snat_proto, + u16 src_port, u16 nat_src_port, + u32 vrf_id); +void snat_ipfix_logging_nat44_ses_delete (u32 src_ip, u32 nat_src_ip, + snat_protocol_t snat_proto, + u16 src_port, u16 nat_src_port, + u32 vrf_id); +void snat_ipfix_logging_addresses_exhausted(u32 pool_id); +void snat_ipfix_logging_max_entries_per_user(u32 src_ip); + +#endif /* __included_nat_ipfix_logging_h__ */ diff --git a/src/plugins/nat/nat_msg_enum.h b/src/plugins/nat/nat_msg_enum.h new file mode 100644 index 00000000..710b631c --- /dev/null +++ b/src/plugins/nat/nat_msg_enum.h @@ -0,0 +1,31 @@ + +/* + * nat_msg_enum.h - skeleton vpp engine plug-in message enumeration + * + * Copyright (c) <current-year> <your-organization> + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef included_nat_msg_enum_h +#define included_nat_msg_enum_h + +#include <vppinfra/byte_order.h> + +#define vl_msg_id(n,h) n, +typedef enum { +#include <nat/nat_all_api_h.h> + /* We'll want to know how many message IDs we need... */ + VL_MSG_FIRST_AVAILABLE, +} vl_msg_id_t; +#undef vl_msg_id + +#endif /* included_nat_msg_enum_h */ diff --git a/src/plugins/nat/nat_test.c b/src/plugins/nat/nat_test.c new file mode 100644 index 00000000..e0b04940 --- /dev/null +++ b/src/plugins/nat/nat_test.c @@ -0,0 +1,1167 @@ + +/* + * nat_test.c - skeleton vpp-api-test plug-in + * + * Copyright (c) <current-year> <your-organization> + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <vat/vat.h> +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vlibsocket/api.h> +#include <vppinfra/error.h> +#include <vnet/ip/ip.h> +#include <nat/nat.h> + +#define __plugin_msg_base snat_test_main.msg_id_base +#include <vlibapi/vat_helper_macros.h> + +uword unformat_sw_if_index (unformat_input_t * input, va_list * args); + +/* Declare message IDs */ +#include <nat/nat_msg_enum.h> + +/* define message structures */ +#define vl_typedefs +#include <nat/nat_all_api_h.h> +#undef vl_typedefs + +/* declare message handlers for each api */ + +#define vl_endianfun /* define message structures */ +#include <nat/nat_all_api_h.h> +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) +#define vl_printfun +#include <nat/nat_all_api_h.h> +#undef vl_printfun + +/* Get the API version number. */ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <nat/nat_all_api_h.h> +#undef vl_api_version + +typedef struct { + /* API message ID base */ + u16 msg_id_base; + vat_main_t *vat_main; +} snat_test_main_t; + +snat_test_main_t snat_test_main; + +#define foreach_standard_reply_retval_handler \ +_(snat_add_address_range_reply) \ +_(snat_interface_add_del_feature_reply) \ +_(snat_add_static_mapping_reply) \ +_(snat_set_workers_reply) \ +_(snat_add_del_interface_addr_reply) \ +_(snat_ipfix_enable_disable_reply) \ +_(snat_add_det_map_reply) \ +_(snat_det_set_timeouts_reply) \ +_(snat_det_close_session_out_reply) \ +_(snat_det_close_session_in_reply) + +#define _(n) \ + static void vl_api_##n##_t_handler \ + (vl_api_##n##_t * mp) \ + { \ + vat_main_t * vam = snat_test_main.vat_main; \ + i32 retval = ntohl(mp->retval); \ + if (vam->async_mode) { \ + vam->async_errors += (retval < 0); \ + } else { \ + vam->retval = retval; \ + vam->result_ready = 1; \ + } \ + } +foreach_standard_reply_retval_handler; +#undef _ + +/* + * Table of message reply handlers, must include boilerplate handlers + * we just generated + */ +#define foreach_vpe_api_reply_msg \ +_(SNAT_ADD_ADDRESS_RANGE_REPLY, snat_add_address_range_reply) \ +_(SNAT_INTERFACE_ADD_DEL_FEATURE_REPLY, \ + snat_interface_add_del_feature_reply) \ +_(SNAT_ADD_STATIC_MAPPING_REPLY, snat_add_static_mapping_reply) \ +_(SNAT_CONTROL_PING_REPLY, snat_control_ping_reply) \ +_(SNAT_STATIC_MAPPING_DETAILS, snat_static_mapping_details) \ +_(SNAT_SHOW_CONFIG_REPLY, snat_show_config_reply) \ +_(SNAT_ADDRESS_DETAILS, snat_address_details) \ +_(SNAT_INTERFACE_DETAILS, snat_interface_details) \ +_(SNAT_SET_WORKERS_REPLY, snat_set_workers_reply) \ +_(SNAT_WORKER_DETAILS, snat_worker_details) \ +_(SNAT_ADD_DEL_INTERFACE_ADDR_REPLY, \ + snat_add_del_interface_addr_reply) \ +_(SNAT_INTERFACE_ADDR_DETAILS, snat_interface_addr_details) \ +_(SNAT_IPFIX_ENABLE_DISABLE_REPLY, \ + snat_ipfix_enable_disable_reply) \ +_(SNAT_USER_DETAILS, snat_user_details) \ +_(SNAT_USER_SESSION_DETAILS, snat_user_session_details) \ +_(SNAT_ADD_DET_MAP_REPLY, snat_add_det_map_reply) \ +_(SNAT_DET_FORWARD_REPLY, snat_det_forward_reply) \ +_(SNAT_DET_REVERSE_REPLY, snat_det_reverse_reply) \ 
+_(SNAT_DET_MAP_DETAILS, snat_det_map_details) \ +_(SNAT_DET_SET_TIMEOUTS_REPLY, snat_det_set_timeouts_reply) \ +_(SNAT_DET_GET_TIMEOUTS_REPLY, snat_det_get_timeouts_reply) \ +_(SNAT_DET_CLOSE_SESSION_OUT_REPLY, \ + snat_det_close_session_out_reply) \ +_(SNAT_DET_CLOSE_SESSION_IN_REPLY, \ + snat_det_close_session_in_reply) \ +_(SNAT_DET_SESSION_DETAILS, snat_det_session_details) + +static int api_snat_add_address_range (vat_main_t * vam) +{ + unformat_input_t * i = vam->input; + ip4_address_t start_addr, end_addr; + u32 start_host_order, end_host_order; + vl_api_snat_add_address_range_t * mp; + u8 is_add = 1; + int count; + int ret; + + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) + { + if (unformat (i, "%U - %U", + unformat_ip4_address, &start_addr, + unformat_ip4_address, &end_addr)) + ; + else if (unformat (i, "%U", unformat_ip4_address, &start_addr)) + end_addr = start_addr; + else if (unformat (i, "del")) + is_add = 0; + else + { + clib_warning("unknown input '%U'", format_unformat_error, i); + return -99; + } + } + + start_host_order = clib_host_to_net_u32 (start_addr.as_u32); + end_host_order = clib_host_to_net_u32 (end_addr.as_u32); + + if (end_host_order < start_host_order) + { + errmsg ("end address less than start address\n"); + return -99; + } + + count = (end_host_order - start_host_order) + 1; + + if (count > 1024) + { + errmsg ("%U - %U, %d addresses...\n", + format_ip4_address, &start_addr, + format_ip4_address, &end_addr, + count); + } + + M(SNAT_ADD_ADDRESS_RANGE, mp); + + memcpy (mp->first_ip_address, &start_addr, 4); + memcpy (mp->last_ip_address, &end_addr, 4); + mp->is_ip4 = 1; + mp->is_add = is_add; + + S(mp); + W (ret); + return ret; +} + +static int api_snat_interface_add_del_feature (vat_main_t * vam) +{ + unformat_input_t * i = vam->input; + vl_api_snat_interface_add_del_feature_t * mp; + u32 sw_if_index; + u8 sw_if_index_set = 0; + u8 is_inside = 1; + u8 is_add = 1; + int ret; + + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) + { + if (unformat (i, "%U", unformat_sw_if_index, vam, &sw_if_index)) + sw_if_index_set = 1; + else if (unformat (i, "sw_if_index %d", &sw_if_index)) + sw_if_index_set = 1; + else if (unformat (i, "out")) + is_inside = 0; + else if (unformat (i, "in")) + is_inside = 1; + else if (unformat (i, "del")) + is_add = 0; + else + { + clib_warning("unknown input '%U'", format_unformat_error, i); + return -99; + } + } + + if (sw_if_index_set == 0) + { + errmsg ("interface / sw_if_index required\n"); + return -99; + } + + M(SNAT_INTERFACE_ADD_DEL_FEATURE, mp); + mp->sw_if_index = ntohl(sw_if_index); + mp->is_add = is_add; + mp->is_inside = is_inside; + + S(mp); + W (ret); + return ret; +} + +static int api_snat_add_static_mapping(vat_main_t * vam) +{ + unformat_input_t * i = vam->input; + vl_api_snat_add_static_mapping_t * mp; + u8 external_addr_set = 0; + u8 local_addr_set = 0; + u8 is_add = 1; + u8 addr_only = 1; + ip4_address_t local_addr, external_addr; + u32 local_port = 0, external_port = 0, vrf_id = ~0; + u32 sw_if_index = ~0; + u8 sw_if_index_set = 0; + u32 proto = ~0; + u8 proto_set = 0; + int ret; + + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) + { + if (unformat (i, "local_addr %U", unformat_ip4_address, &local_addr)) + local_addr_set = 1; + else if (unformat (i, "external_addr %U", unformat_ip4_address, + &external_addr)) + external_addr_set = 1; + else if (unformat (i, "local_port %u", &local_port)) + addr_only = 0; + else if (unformat (i, "external_port %u", &external_port)) + addr_only = 0; + 
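/* Giving a local or external port turns an address-only mapping into an address-and-port mapping. */ +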
else if (unformat (i, "external_if %U", unformat_sw_if_index, vam, + &sw_if_index)) + sw_if_index_set = 1; + else if (unformat (i, "external_sw_if_index %d", &sw_if_index)) + sw_if_index_set = 1; + else if (unformat (i, "vrf %u", &vrf_id)) + ; + else if (unformat (i, "protocol %u", &proto)) + proto_set = 1; + else if (unformat (i, "del")) + is_add = 0; + else + { + clib_warning("unknown input '%U'", format_unformat_error, i); + return -99; + } + } + + if (!addr_only && !proto_set) + { + errmsg ("protocol required\n"); + return -99; + } + + if (!local_addr_set) + { + errmsg ("local addr required\n"); + return -99; + } + if (!external_addr_set && !sw_if_index_set) + { + errmsg ("external addr or interface required\n"); + return -99; + } + + M(SNAT_ADD_STATIC_MAPPING, mp); + mp->is_add = is_add; + mp->is_ip4 = 1; + mp->addr_only = addr_only; + mp->local_port = ntohs ((u16) local_port); + mp->external_port = ntohs ((u16) external_port); + mp->external_sw_if_index = ntohl (sw_if_index); + mp->vrf_id = ntohl (vrf_id); + mp->protocol = (u8) proto; + memcpy (mp->local_ip_address, &local_addr, 4); + memcpy (mp->external_ip_address, &external_addr, 4); + + S(mp); + W (ret); + return ret; +} + +static void vl_api_snat_control_ping_reply_t_handler + (vl_api_snat_control_ping_reply_t * mp) +{ + vat_main_t *vam = &vat_main; + i32 retval = ntohl (mp->retval); + if (vam->async_mode) + { + vam->async_errors += (retval < 0); + } + else + { + vam->retval = retval; + vam->result_ready = 1; + } +} + +static void vl_api_snat_static_mapping_details_t_handler + (vl_api_snat_static_mapping_details_t *mp) +{ + snat_test_main_t * sm = &snat_test_main; + vat_main_t *vam = sm->vat_main; + + if (mp->addr_only && mp->external_sw_if_index != ~0) + fformat (vam->ofp, "%15U%6s%15d%6s%11d%6d\n", + format_ip4_address, &mp->local_ip_address, "", + ntohl (mp->external_sw_if_index), "", + ntohl (mp->vrf_id), + mp->protocol); + else if (mp->addr_only && mp->external_sw_if_index == ~0) + fformat (vam->ofp, "%15U%6s%15U%6s%11d%6d\n", + format_ip4_address, &mp->local_ip_address, "", + format_ip4_address, &mp->external_ip_address, "", + ntohl (mp->vrf_id), + mp->protocol); + else if (!mp->addr_only && mp->external_sw_if_index != ~0) + fformat (vam->ofp, "%15U%6d%15d%6d%11d%6d\n", + format_ip4_address, &mp->local_ip_address, + ntohs (mp->local_port), + ntohl (mp->external_sw_if_index), + ntohs (mp->external_port), + ntohl (mp->vrf_id), + mp->protocol); + else + fformat (vam->ofp, "%15U%6d%15U%6d%11d%6d\n", + format_ip4_address, &mp->local_ip_address, + ntohs (mp->local_port), + format_ip4_address, &mp->external_ip_address, + ntohs (mp->external_port), + ntohl (mp->vrf_id), + mp->protocol); + +} + +static int api_snat_static_mapping_dump(vat_main_t * vam) +{ + vl_api_snat_static_mapping_dump_t * mp; + vl_api_snat_control_ping_t *mp_ping; + int ret; + + if (vam->json_output) + { + clib_warning ("JSON output not supported for snat_static_mapping_dump"); + return -99; + } + + fformat (vam->ofp, "%21s%21s\n", "local", "external"); + fformat (vam->ofp, "%15s%6s%15s%6s%11s%6s\n", "address", "port", + "address/if_idx", "port", "vrf", "proto"); + + M(SNAT_STATIC_MAPPING_DUMP, mp); + S(mp); + + /* Use a control ping for synchronization */ + M(SNAT_CONTROL_PING, mp_ping); + S(mp_ping); + + W (ret); + return ret; +} + +static void vl_api_snat_show_config_reply_t_handler + (vl_api_snat_show_config_reply_t *mp) +{ + snat_test_main_t * sm = &snat_test_main; + vat_main_t *vam = sm->vat_main; + i32 retval = ntohl (mp->retval); + + if (retval >= 0) + 
{ + fformat (vam->ofp, "translation hash buckets %d\n", + ntohl (mp->translation_buckets)); + fformat (vam->ofp, "translation hash memory %d\n", + ntohl (mp->translation_memory_size)); + fformat (vam->ofp, "user hash buckets %d\n", ntohl (mp->user_buckets)); + fformat (vam->ofp, "user hash memory %d\n", ntohl (mp->user_memory_size)); + fformat (vam->ofp, "max translations per user %d\n", + ntohl (mp->max_translations_per_user)); + fformat (vam->ofp, "outside VRF id %d\n", ntohl (mp->outside_vrf_id)); + fformat (vam->ofp, "inside VRF id %d\n", ntohl (mp->inside_vrf_id)); + if (mp->static_mapping_only) + { + fformat (vam->ofp, "static mapping only"); + if (mp->static_mapping_connection_tracking) + fformat (vam->ofp, " connection tracking"); + fformat (vam->ofp, "\n"); + } + } + vam->retval = retval; + vam->result_ready = 1; +} + +static int api_snat_show_config(vat_main_t * vam) +{ + vl_api_snat_show_config_t * mp; + int ret; + + if (vam->json_output) + { + clib_warning ("JSON output not supported for snat_show_config"); + return -99; + } + + M(SNAT_SHOW_CONFIG, mp); + S(mp); + W (ret); + return ret; +} + +static void vl_api_snat_address_details_t_handler + (vl_api_snat_address_details_t *mp) +{ + snat_test_main_t * sm = &snat_test_main; + vat_main_t *vam = sm->vat_main; + + fformat (vam->ofp, "%U\n", format_ip4_address, &mp->ip_address); +} + +static int api_snat_address_dump(vat_main_t * vam) +{ + vl_api_snat_address_dump_t * mp; + vl_api_snat_control_ping_t *mp_ping; + int ret; + + if (vam->json_output) + { + clib_warning ("JSON output not supported for snat_address_dump"); + return -99; + } + + M(SNAT_ADDRESS_DUMP, mp); + S(mp); + + /* Use a control ping for synchronization */ + M(SNAT_CONTROL_PING, mp_ping); + S(mp_ping); + + W (ret); + return ret; +} + +static void vl_api_snat_interface_details_t_handler + (vl_api_snat_interface_details_t *mp) +{ + snat_test_main_t * sm = &snat_test_main; + vat_main_t *vam = sm->vat_main; + + fformat (vam->ofp, "sw_if_index %d %s\n", ntohl (mp->sw_if_index), + mp->is_inside ? 
"in" : "out"); +} + +static int api_snat_interface_dump(vat_main_t * vam) +{ + vl_api_snat_interface_dump_t * mp; + vl_api_snat_control_ping_t *mp_ping; + int ret; + + if (vam->json_output) + { + clib_warning ("JSON output not supported for snat_address_dump"); + return -99; + } + + M(SNAT_INTERFACE_DUMP, mp); + S(mp); + + /* Use a control ping for synchronization */ + M(SNAT_CONTROL_PING, mp_ping); + S(mp_ping); + + W (ret); + return ret; +} + +static int api_snat_set_workers (vat_main_t * vam) +{ + unformat_input_t * i = vam->input; + vl_api_snat_set_workers_t * mp; + uword *bitmap; + int ret; + + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) + { + if (unformat (i, "%U", unformat_bitmap_list, &bitmap)) + ; + else + { + clib_warning("unknown input '%U'", format_unformat_error, i); + return -99; + } + } + + M(SNAT_SET_WORKERS, mp); + mp->worker_mask = clib_host_to_net_u64 (bitmap[0]); + + S(mp); + W (ret); + return ret; +} + +static void vl_api_snat_worker_details_t_handler + (vl_api_snat_worker_details_t *mp) +{ + snat_test_main_t * sm = &snat_test_main; + vat_main_t *vam = sm->vat_main; + + fformat (vam->ofp, "worker_index %d (%s at lcore %u)\n", + ntohl (mp->worker_index), mp->name, ntohl (mp->lcore_id)); +} + +static int api_snat_worker_dump(vat_main_t * vam) +{ + vl_api_snat_worker_dump_t * mp; + vl_api_snat_control_ping_t *mp_ping; + int ret; + + if (vam->json_output) + { + clib_warning ("JSON output not supported for snat_address_dump"); + return -99; + } + + M(SNAT_WORKER_DUMP, mp); + S(mp); + + /* Use a control ping for synchronization */ + M(SNAT_CONTROL_PING, mp_ping); + S(mp_ping); + + W (ret); + return ret; +} + +static int api_snat_add_del_interface_addr (vat_main_t * vam) +{ + unformat_input_t * i = vam->input; + vl_api_snat_add_del_interface_addr_t * mp; + u32 sw_if_index; + u8 sw_if_index_set = 0; + u8 is_add = 1; + int ret; + + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) + { + if (unformat (i, "%U", unformat_sw_if_index, vam, &sw_if_index)) + sw_if_index_set = 1; + else if (unformat (i, "sw_if_index %d", &sw_if_index)) + sw_if_index_set = 1; + else if (unformat (i, "del")) + is_add = 0; + else + { + clib_warning("unknown input '%U'", format_unformat_error, i); + return -99; + } + } + + if (sw_if_index_set == 0) + { + errmsg ("interface / sw_if_index required\n"); + return -99; + } + + M(SNAT_ADD_DEL_INTERFACE_ADDR, mp); + mp->sw_if_index = ntohl(sw_if_index); + mp->is_add = is_add; + + S(mp); + W (ret); + return ret; +} + +static void vl_api_snat_interface_addr_details_t_handler + (vl_api_snat_interface_addr_details_t *mp) +{ + snat_test_main_t * sm = &snat_test_main; + vat_main_t *vam = sm->vat_main; + + fformat (vam->ofp, "sw_if_index %d\n", ntohl (mp->sw_if_index)); +} + +static int api_snat_interface_addr_dump(vat_main_t * vam) +{ + vl_api_snat_interface_addr_dump_t * mp; + vl_api_snat_control_ping_t *mp_ping; + int ret; + + if (vam->json_output) + { + clib_warning ("JSON output not supported for snat_address_dump"); + return -99; + } + + M(SNAT_INTERFACE_ADDR_DUMP, mp); + S(mp); + + /* Use a control ping for synchronization */ + M(SNAT_CONTROL_PING, mp_ping); + S(mp_ping); + + W (ret); + return ret; +} + +static int api_snat_ipfix_enable_disable (vat_main_t * vam) +{ + unformat_input_t * i = vam->input; + vl_api_snat_ipfix_enable_disable_t * mp; + u32 domain_id = 0; + u32 src_port = 0; + u8 enable = 1; + int ret; + + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) + { + if (unformat (i, "domain %d", &domain_id)) + ; + else if 
(unformat (i, "src_port %d", &src_port)) + ; + else if (unformat (i, "disable")) + enable = 0; + else + { + clib_warning("unknown input '%U'", format_unformat_error, i); + return -99; + } + } + + M(SNAT_IPFIX_ENABLE_DISABLE, mp); + mp->domain_id = htonl(domain_id); + mp->src_port = htons((u16) src_port); + mp->enable = enable; + + S(mp); + W (ret); + return ret; +} + +static void vl_api_snat_user_session_details_t_handler + (vl_api_snat_user_session_details_t *mp) +{ + snat_test_main_t * sm = &snat_test_main; + vat_main_t *vam = sm->vat_main; + + fformat(vam->ofp, "%s session %U:%d to %U:%d protocol id %d " + "total packets %d total bytes %d\n", + mp->is_static ? "static" : "dynamic", + format_ip4_address, mp->inside_ip_address, ntohl(mp->inside_port), + format_ip4_address, mp->outside_ip_address, ntohl(mp->outside_port), + ntohl(mp->protocol), ntohl(mp->total_pkts), ntohl(mp->total_bytes)); +} + +static int api_snat_user_session_dump(vat_main_t * vam) +{ + unformat_input_t* i = vam->input; + vl_api_snat_user_session_dump_t * mp; + vl_api_snat_control_ping_t *mp_ping; + ip4_address_t addr; + u32 vrf_id = ~0; + int ret; + + if (vam->json_output) + { + clib_warning ("JSON output not supported for snat_address_dump"); + return -99; + } + + if (unformat (i, "ip_address %U vrf_id %d", + unformat_ip4_address, &addr, &vrf_id)) + ; + else + { + clib_warning("unknown input '%U'", format_unformat_error, i); + return -99; + } + + M(SNAT_USER_SESSION_DUMP, mp); + S(mp); + + /* Use a control ping for synchronization */ + M(SNAT_CONTROL_PING, mp_ping); + memset(mp->ip_address, 0, 16); + clib_memcpy(mp->ip_address, &addr, 4); + mp->vrf_id = htonl(vrf_id); + mp->is_ip4 = 1; + S(mp_ping); + + W (ret); + return ret; +} + +static void vl_api_snat_user_details_t_handler + (vl_api_snat_user_details_t *mp) +{ + snat_test_main_t * sm = &snat_test_main; + vat_main_t *vam = sm->vat_main; + + fformat(vam->ofp, "user with ip %U with vrf_id %d " + "with %d sessions and %d static sessions\n", + format_ip4_address, mp->ip_address, ntohl(mp->vrf_id), + ntohl(mp->nsessions), ntohl(mp->nstaticsessions)); +} + +static int api_snat_user_dump(vat_main_t * vam) +{ + vl_api_snat_user_dump_t * mp; + vl_api_snat_control_ping_t *mp_ping; + int ret; + + if (vam->json_output) + { + clib_warning ("JSON output not supported for snat_address_dump"); + return -99; + } + + M(SNAT_USER_DUMP, mp); + S(mp); + + /* Use a control ping for synchronization */ + M(SNAT_CONTROL_PING, mp_ping); + S(mp_ping); + + W (ret); + return ret; +} + +static int api_snat_add_det_map (vat_main_t * vam) +{ + unformat_input_t * i = vam->input; + vl_api_snat_add_det_map_t * mp; + ip4_address_t in_addr, out_addr; + u32 in_plen, out_plen; + u8 is_add = 1; + int ret; + + if (unformat (i, "in %U/%d out %U/%d", + unformat_ip4_address, &in_addr, &in_plen, + unformat_ip4_address, &out_addr, &out_plen)) + ; + else if (unformat (i, "del")) + is_add = 0; + else + { + clib_warning("unknown input '%U'", format_unformat_error, i); + return -99; + } + + M(SNAT_ADD_DET_MAP, mp); + clib_memcpy(mp->in_addr, &in_addr, 4); + mp->in_plen = in_plen; + clib_memcpy(mp->out_addr, &out_addr, 4); + mp->out_plen = out_plen; + mp->is_add = is_add; + + S(mp); + W (ret); + return ret; +} + +static void vl_api_snat_det_forward_reply_t_handler + (vl_api_snat_det_forward_reply_t *mp) +{ + snat_test_main_t * sm = &snat_test_main; + vat_main_t *vam = sm->vat_main; + i32 retval = ntohl(mp->retval); + + if (retval >= 0) + { + fformat (vam->ofp, "outside address %U", format_ip4_address, 
&mp->out_addr); + fformat (vam->ofp, " outside port range start %d", ntohs(mp->out_port_lo)); + fformat (vam->ofp, " outside port range end %d\n", ntohs(mp->out_port_hi)); + } + + vam->retval = retval; + vam->result_ready = 1; +} + +static int api_snat_det_forward (vat_main_t * vam) +{ + unformat_input_t * i = vam->input; + vl_api_snat_det_forward_t * mp; + ip4_address_t in_addr; + int ret; + + if (unformat (i, "%U", unformat_ip4_address, &in_addr)) + ; + else + { + clib_warning("unknown input '%U'", format_unformat_error, i); + return -99; + } + + M(SNAT_DET_FORWARD, mp); + clib_memcpy(mp->in_addr, &in_addr, 4); + + S(mp); + W(ret); + return ret; +} + +static void vl_api_snat_det_reverse_reply_t_handler + (vl_api_snat_det_reverse_reply_t *mp) +{ + snat_test_main_t * sm = &snat_test_main; + vat_main_t *vam = sm->vat_main; + i32 retval = ntohl(mp->retval); + + if (retval >= 0) + { + fformat (vam->ofp, "inside address %U\n", format_ip4_address, &mp->in_addr); + } + + vam->retval = retval; + vam->result_ready = 1; +} + +static int api_snat_det_reverse (vat_main_t * vam) +{ + unformat_input_t * i = vam->input; + vl_api_snat_det_reverse_t * mp; + ip4_address_t out_addr; + u32 out_port; + int ret; + + if (unformat (i, "%U %d", unformat_ip4_address, &out_addr, &out_port)) + ; + else + { + clib_warning("unknown input '%U'", format_unformat_error, i); + return -99; + } + + M(SNAT_DET_REVERSE, mp); + clib_memcpy(mp->out_addr, &out_addr, 4); + mp->out_port = htons((u16)out_port); + + S(mp); + W(ret); + return ret; +} + +static void vl_api_snat_det_map_details_t_handler + (vl_api_snat_det_map_details_t *mp) +{ + snat_test_main_t * sm = &snat_test_main; + vat_main_t *vam = sm->vat_main; + + fformat (vam->ofp, "Deterministic S-NAT mapping in %U/%d out %U/%d " + "ports per host %d sharing ratio %d " + "number of sessions %d", + format_ip4_address, mp->in_addr, mp->in_plen, + format_ip4_address, mp->out_addr, mp->out_plen, + ntohs(mp->ports_per_host), ntohl(mp->sharing_ratio), + ntohl(mp->ses_num)); +} + +static int api_snat_det_map_dump(vat_main_t * vam) +{ + vl_api_snat_det_map_dump_t * mp; + vl_api_snat_control_ping_t *mp_ping; + int ret; + + if (vam->json_output) + { + clib_warning ("JSON output not supported for snat_det_map_dump"); + return -99; + } + + M(SNAT_DET_MAP_DUMP, mp); + S(mp); + + /* Use a control ping for synchronization */ + M(SNAT_CONTROL_PING, mp_ping); + S(mp_ping); + + W (ret); + return ret; +} + +static int api_snat_det_set_timeouts (vat_main_t * vam) +{ + unformat_input_t * i = vam->input; + vl_api_snat_det_set_timeouts_t * mp; + u32 udp = SNAT_UDP_TIMEOUT; + u32 tcp_established = SNAT_TCP_ESTABLISHED_TIMEOUT; + u32 tcp_transitory = SNAT_TCP_TRANSITORY_TIMEOUT; + u32 icmp = SNAT_ICMP_TIMEOUT; + int ret; + + if (unformat (i, "udp %d", &udp)) + ; + else if (unformat (i, "tcp_established %d", &tcp_established)) + ; + else if (unformat (i, "tcp_transitory %d", &tcp_transitory)) + ; + else if (unformat (i, "icmp %d", &icmp)) + ; + else + { + clib_warning("unknown input '%U'", format_unformat_error, i); + return -99; + } + + M(SNAT_DET_SET_TIMEOUTS, mp); + mp->udp = htonl(udp); + mp->tcp_established = htonl(tcp_established); + mp->tcp_transitory = htonl(tcp_transitory); + mp->icmp = htonl(icmp); + + S(mp); + W (ret); + return ret; +} + +static void vl_api_snat_det_get_timeouts_reply_t_handler + (vl_api_snat_det_get_timeouts_reply_t *mp) +{ + snat_test_main_t * sm = &snat_test_main; + vat_main_t *vam = sm->vat_main; + i32 retval = ntohl (mp->retval); + + if (retval >= 0) + { + fformat 
(vam->ofp, "udp timeout: %dsec\n", ntohl (mp->udp)); + fformat (vam->ofp, "tcp-established timeout: %dsec", + ntohl (mp->tcp_established)); + fformat (vam->ofp, "tcp-transitory timeout: %dsec", + ntohl (mp->tcp_transitory)); + fformat (vam->ofp, "icmp timeout: %dsec", ntohl (mp->icmp)); + } + vam->retval = retval; + vam->result_ready = 1; +} + +static int api_snat_det_get_timeouts(vat_main_t * vam) +{ + vl_api_snat_det_get_timeouts_t * mp; + int ret; + + if (vam->json_output) + { + clib_warning ("JSON output not supported for snat_show_config"); + return -99; + } + + M(SNAT_DET_GET_TIMEOUTS, mp); + S(mp); + W (ret); + return ret; +} + +static int api_snat_det_close_session_out (vat_main_t * vam) +{ + unformat_input_t * i = vam->input; + vl_api_snat_det_close_session_out_t * mp; + ip4_address_t out_addr, ext_addr; + u32 out_port, ext_port; + int ret; + + if (unformat (i, "%U:%d %U:%d", + unformat_ip4_address, &out_addr, &out_port, + unformat_ip4_address, &ext_addr, &ext_port)) + ; + else + { + clib_warning("unknown input '%U'", format_unformat_error, i); + return -99; + } + + M(SNAT_DET_CLOSE_SESSION_OUT, mp); + clib_memcpy(mp->out_addr, &out_addr, 4); + mp->out_port = ntohs((u16)out_port); + clib_memcpy(mp->ext_addr, &ext_addr, 4); + mp->ext_port = ntohs((u16)ext_port); + + S(mp); + W (ret); + return ret; +} + +static int api_snat_det_close_session_in (vat_main_t * vam) +{ + unformat_input_t * i = vam->input; + vl_api_snat_det_close_session_in_t * mp; + ip4_address_t in_addr, ext_addr; + u32 in_port, ext_port; + int ret; + + if (unformat (i, "%U:%d %U:%d", + unformat_ip4_address, &in_addr, &in_port, + unformat_ip4_address, &ext_addr, &ext_port)) + ; + else + { + clib_warning("unknown input '%U'", format_unformat_error, i); + return -99; + } + + M(SNAT_DET_CLOSE_SESSION_IN, mp); + clib_memcpy(mp->in_addr, &in_addr, 4); + mp->in_port = ntohs((u16)in_port); + clib_memcpy(mp->ext_addr, &ext_addr, 4); + mp->ext_port = ntohs((u16)ext_port); + + S(mp); + W (ret); + return ret; +} + +static void vl_api_snat_det_session_details_t_handler + (vl_api_snat_det_session_details_t *mp) +{ + snat_test_main_t * sm = &snat_test_main; + vat_main_t *vam = sm->vat_main; + + fformat(vam->ofp, "deterministic session, external host address %U, " + "external host port %d, outer port %d, inside port %d", + format_ip4_address, mp->ext_addr, mp->ext_port, + mp->out_port, mp->in_port); +} + +static int api_snat_det_session_dump(vat_main_t * vam) +{ + unformat_input_t* i = vam->input; + vl_api_snat_det_session_dump_t * mp; + vl_api_snat_control_ping_t *mp_ping; + ip4_address_t user_addr; + int ret; + + if (vam->json_output) + { + clib_warning ("JSON output not supported for snat_det_session_dump"); + return -99; + } + + if (unformat (i, "user_addr %U", unformat_ip4_address, &user_addr)) + ; + else + { + clib_warning ("unknown input '%U'", format_unformat_error, i); + return -99; + } + + M(SNAT_DET_SESSION_DUMP, mp); + clib_memcpy (&mp->user_addr, &user_addr, 4); + S(mp); + + /* Use a control ping for synchronization */ + M(SNAT_CONTROL_PING, mp_ping); + S(mp_ping); + + W (ret); + return ret; +} + +/* + * List of messages that the api test plugin sends, + * and that the data plane plugin processes + */ +#define foreach_vpe_api_msg \ +_(snat_add_address_range, "<start-addr> [- <end-addr] [del]") \ +_(snat_interface_add_del_feature, \ + "<intfc> | sw_if_index <id> [in] [out] [del]") \ +_(snat_add_static_mapping, "local_addr <ip> (external_addr <ip>" \ + " | external_if <intfc> | external_sw_if_ndex <id>) " \ + "[local_port 
<n>] [external_port <n>] [vrf <table-id>] [del] " \ + "protocol <n>") \ +_(snat_set_workers, "<workers_bitmap>") \ +_(snat_static_mapping_dump, "") \ +_(snat_show_config, "") \ +_(snat_address_dump, "") \ +_(snat_interface_dump, "") \ +_(snat_worker_dump, "") \ +_(snat_add_del_interface_addr, \ + "<intfc> | sw_if_index <id> [del]") \ +_(snat_interface_addr_dump, "") \ +_(snat_ipfix_enable_disable, "[domain <id>] [src_port <n>] " \ + "[disable]") \ +_(snat_user_dump, "") \ +_(snat_user_session_dump, "ip_address <ip> vrf_id <table-id>") \ +_(snat_add_det_map, "in <in_addr>/<in_plen> out " \ + "<out_addr>/<out_plen> [del]") \ +_(snat_det_forward, "<in_addr>") \ +_(snat_det_reverse, "<out_addr> <out_port>") \ +_(snat_det_map_dump, "") \ +_(snat_det_set_timeouts, "[udp <sec> | tcp_established <sec> | " \ + "tcp_transitory <sec> | icmp <sec>]") \ +_(snat_det_get_timeouts, "") \ +_(snat_det_close_session_out, "<out_addr>:<out_port> " \ + "<ext_addr>:<ext_port>") \ +_(snat_det_close_session_in, "<in_addr>:<in_port> " \ + "<ext_addr>:<ext_port>") \ +_(snat_det_session_dump, "user_addr <addr>") + +static void +snat_vat_api_hookup (vat_main_t *vam) +{ + snat_test_main_t * sm __attribute__((unused)) = &snat_test_main; + /* Hook up handlers for replies from the data plane plug-in */ +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + sm->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_vpe_api_reply_msg; +#undef _ + + /* API messages we can send */ +#define _(n,h) \ + hash_set_mem (vam->function_by_name, #n, api_##n); + foreach_vpe_api_msg; +#undef _ + + /* Help strings */ +#define _(n,h) hash_set_mem (vam->help_by_name, #n, h); + foreach_vpe_api_msg; +#undef _ +} + +clib_error_t * vat_plugin_register (vat_main_t *vam) +{ + snat_test_main_t * sm = &snat_test_main; + u8 * name; + + sm->vat_main = vam; + + /* Ask the vpp engine for the first assigned message-id */ + name = format (0, "snat_%08x%c", api_version, 0); + sm->msg_id_base = vl_client_get_first_plugin_msg_id ((char *) name); + + if (sm->msg_id_base != (u16) ~0) + snat_vat_api_hookup (vam); + + vec_free(name); + + return 0; +} diff --git a/src/plugins/nat/out2in.c b/src/plugins/nat/out2in.c new file mode 100755 index 00000000..f250136b --- /dev/null +++ b/src/plugins/nat/out2in.c @@ -0,0 +1,2514 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vnet/handoff.h> + +#include <vnet/ip/ip.h> +#include <vnet/udp/udp.h> +#include <vnet/ethernet/ethernet.h> +#include <vnet/fib/ip4_fib.h> +#include <nat/nat.h> +#include <nat/nat_ipfix_logging.h> +#include <nat/nat_det.h> + +#include <vppinfra/hash.h> +#include <vppinfra/error.h> +#include <vppinfra/elog.h> + +typedef struct { + u32 sw_if_index; + u32 next_index; + u32 session_index; +} snat_out2in_trace_t; + +typedef struct { + u32 next_worker_index; + u8 do_handoff; +} snat_out2in_worker_handoff_trace_t; + +/* packet trace format function */ +static u8 * format_snat_out2in_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + snat_out2in_trace_t * t = va_arg (*args, snat_out2in_trace_t *); + + s = format (s, "NAT44_OUT2IN: sw_if_index %d, next index %d, session index %d", + t->sw_if_index, t->next_index, t->session_index); + return s; +} + +static u8 * format_snat_out2in_fast_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + snat_out2in_trace_t * t = va_arg (*args, snat_out2in_trace_t *); + + s = format (s, "NAT44_OUT2IN_FAST: sw_if_index %d, next index %d", + t->sw_if_index, t->next_index); + return s; +} + +static u8 * format_snat_out2in_worker_handoff_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + snat_out2in_worker_handoff_trace_t * t = + va_arg (*args, snat_out2in_worker_handoff_trace_t *); + char * m; + + m = t->do_handoff ? "next worker" : "same worker"; + s = format (s, "NAT44_OUT2IN_WORKER_HANDOFF: %s %d", m, t->next_worker_index); + + return s; +} + +vlib_node_registration_t snat_out2in_node; +vlib_node_registration_t snat_out2in_fast_node; +vlib_node_registration_t snat_out2in_worker_handoff_node; +vlib_node_registration_t snat_det_out2in_node; + +#define foreach_snat_out2in_error \ +_(UNSUPPORTED_PROTOCOL, "Unsupported protocol") \ +_(OUT2IN_PACKETS, "Good out2in packets processed") \ +_(BAD_ICMP_TYPE, "unsupported ICMP type") \ +_(NO_TRANSLATION, "No translation") \ +_(MAX_SESSIONS_EXCEEDED, "Maximum sessions exceeded") + +typedef enum { +#define _(sym,str) SNAT_OUT2IN_ERROR_##sym, + foreach_snat_out2in_error +#undef _ + SNAT_OUT2IN_N_ERROR, +} snat_out2in_error_t; + +static char * snat_out2in_error_strings[] = { +#define _(sym,string) string, + foreach_snat_out2in_error +#undef _ +}; + +typedef enum { + SNAT_OUT2IN_NEXT_DROP, + SNAT_OUT2IN_NEXT_LOOKUP, + SNAT_OUT2IN_NEXT_ICMP_ERROR, + SNAT_OUT2IN_N_NEXT, +} snat_out2in_next_t; + +/** + * @brief Create session for static mapping. + * + * Create NAT session initiated by host from external network with static + * mapping. + * + * @param sm NAT main. + * @param b0 Vlib buffer. + * @param in2out In2out NAT44 session key. + * @param out2in Out2in NAT44 session key. + * @param node Vlib node. + * + * @returns SNAT session if successfully created otherwise 0. 
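+ * @param thread_index Worker thread index.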
+ */ +static inline snat_session_t * +create_session_for_static_mapping (snat_main_t *sm, + vlib_buffer_t *b0, + snat_session_key_t in2out, + snat_session_key_t out2in, + vlib_node_runtime_t * node, + u32 thread_index) +{ + snat_user_t *u; + snat_user_key_t user_key; + snat_session_t *s; + clib_bihash_kv_8_8_t kv0, value0; + dlist_elt_t * per_user_translation_list_elt; + dlist_elt_t * per_user_list_head_elt; + ip4_header_t *ip0; + + if (PREDICT_FALSE (maximum_sessions_exceeded(sm, thread_index))) + { + b0->error = node->errors[SNAT_OUT2IN_ERROR_MAX_SESSIONS_EXCEEDED]; + return 0; + } + + ip0 = vlib_buffer_get_current (b0); + + user_key.addr = in2out.addr; + user_key.fib_index = in2out.fib_index; + kv0.key = user_key.as_u64; + + /* Ever heard of the "user" = inside ip4 address before? */ + if (clib_bihash_search_8_8 (&sm->per_thread_data[thread_index].user_hash, + &kv0, &value0)) + { + /* no, make a new one */ + pool_get (sm->per_thread_data[thread_index].users, u); + memset (u, 0, sizeof (*u)); + u->addr = in2out.addr; + u->fib_index = in2out.fib_index; + + pool_get (sm->per_thread_data[thread_index].list_pool, + per_user_list_head_elt); + + u->sessions_per_user_list_head_index = per_user_list_head_elt - + sm->per_thread_data[thread_index].list_pool; + + clib_dlist_init (sm->per_thread_data[thread_index].list_pool, + u->sessions_per_user_list_head_index); + + kv0.value = u - sm->per_thread_data[thread_index].users; + + /* add user */ + clib_bihash_add_del_8_8 (&sm->per_thread_data[thread_index].user_hash, + &kv0, 1 /* is_add */); + } + else + { + u = pool_elt_at_index (sm->per_thread_data[thread_index].users, + value0.value); + } + + pool_get (sm->per_thread_data[thread_index].sessions, s); + memset (s, 0, sizeof (*s)); + + s->outside_address_index = ~0; + s->flags |= SNAT_SESSION_FLAG_STATIC_MAPPING; + s->ext_host_addr.as_u32 = ip0->dst_address.as_u32; + u->nstaticsessions++; + + /* Create list elts */ + pool_get (sm->per_thread_data[thread_index].list_pool, + per_user_translation_list_elt); + clib_dlist_init (sm->per_thread_data[thread_index].list_pool, + per_user_translation_list_elt - + sm->per_thread_data[thread_index].list_pool); + + per_user_translation_list_elt->value = + s - sm->per_thread_data[thread_index].sessions; + s->per_user_index = + per_user_translation_list_elt - sm->per_thread_data[thread_index].list_pool; + s->per_user_list_head_index = u->sessions_per_user_list_head_index; + + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, + s->per_user_list_head_index, + per_user_translation_list_elt - + sm->per_thread_data[thread_index].list_pool); + + s->in2out = in2out; + s->out2in = out2in; + s->in2out.protocol = out2in.protocol; + + /* Add to translation hashes */ + kv0.key = s->in2out.as_u64; + kv0.value = s - sm->per_thread_data[thread_index].sessions; + if (clib_bihash_add_del_8_8 (&sm->per_thread_data[thread_index].in2out, &kv0, + 1 /* is_add */)) + clib_warning ("in2out key add failed"); + + kv0.key = s->out2in.as_u64; + kv0.value = s - sm->per_thread_data[thread_index].sessions; + + if (clib_bihash_add_del_8_8 (&sm->per_thread_data[thread_index].out2in, &kv0, + 1 /* is_add */)) + clib_warning ("out2in key add failed"); + + /* log NAT event */ + snat_ipfix_logging_nat44_ses_create(s->in2out.addr.as_u32, + s->out2in.addr.as_u32, + s->in2out.protocol, + s->in2out.port, + s->out2in.port, + s->in2out.fib_index); + return s; +} + +static_always_inline +snat_out2in_error_t icmp_get_key(ip4_header_t *ip0, + snat_session_key_t *p_key0) +{ + icmp46_header_t *icmp0; 
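+ /* Non-error ICMP is keyed on the outside (destination) address and the echo identifier; ICMP error messages are keyed on the embedded inner packet's source address and port instead. */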
+ snat_session_key_t key0; + icmp_echo_header_t *echo0, *inner_echo0 = 0; + ip4_header_t *inner_ip0; + void *l4_header = 0; + icmp46_header_t *inner_icmp0; + + icmp0 = (icmp46_header_t *) ip4_next_header (ip0); + echo0 = (icmp_echo_header_t *)(icmp0+1); + + if (!icmp_is_error_message (icmp0)) + { + key0.protocol = SNAT_PROTOCOL_ICMP; + key0.addr = ip0->dst_address; + key0.port = echo0->identifier; + } + else + { + inner_ip0 = (ip4_header_t *)(echo0+1); + l4_header = ip4_next_header (inner_ip0); + key0.protocol = ip_proto_to_snat_proto (inner_ip0->protocol); + key0.addr = inner_ip0->src_address; + switch (key0.protocol) + { + case SNAT_PROTOCOL_ICMP: + inner_icmp0 = (icmp46_header_t*)l4_header; + inner_echo0 = (icmp_echo_header_t *)(inner_icmp0+1); + key0.port = inner_echo0->identifier; + break; + case SNAT_PROTOCOL_UDP: + case SNAT_PROTOCOL_TCP: + key0.port = ((tcp_udp_header_t*)l4_header)->src_port; + break; + default: + return SNAT_OUT2IN_ERROR_UNSUPPORTED_PROTOCOL; + } + } + *p_key0 = key0; + return -1; /* success */ +} + +/** + * Get address and port values to be used for ICMP packet translation + * and create session if needed + * + * @param[in,out] sm NAT main + * @param[in,out] node NAT node runtime + * @param[in] thread_index thread index + * @param[in,out] b0 buffer containing packet to be translated + * @param[out] p_proto protocol used for matching + * @param[out] p_value address and port after NAT translation + * @param[out] p_dont_translate if packet should not be translated + * @param d optional parameter + * @param e optional parameter + */ +u32 icmp_match_out2in_slow(snat_main_t *sm, vlib_node_runtime_t *node, + u32 thread_index, vlib_buffer_t *b0, + ip4_header_t *ip0, u8 *p_proto, + snat_session_key_t *p_value, + u8 *p_dont_translate, void *d, void *e) +{ + icmp46_header_t *icmp0; + u32 sw_if_index0; + u32 rx_fib_index0; + snat_session_key_t key0; + snat_session_key_t sm0; + snat_session_t *s0 = 0; + u8 dont_translate = 0; + clib_bihash_kv_8_8_t kv0, value0; + u8 is_addr_only; + u32 next0 = ~0; + int err; + + icmp0 = (icmp46_header_t *) ip4_next_header (ip0); + sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX]; + rx_fib_index0 = ip4_fib_table_get_index_for_sw_if_index (sw_if_index0); + + key0.protocol = 0; + + err = icmp_get_key (ip0, &key0); + if (err != -1) + { + b0->error = node->errors[SNAT_OUT2IN_ERROR_UNSUPPORTED_PROTOCOL]; + next0 = SNAT_OUT2IN_NEXT_DROP; + goto out; + } + key0.fib_index = rx_fib_index0; + + kv0.key = key0.as_u64; + + if (clib_bihash_search_8_8 (&sm->per_thread_data[thread_index].out2in, &kv0, + &value0)) + { + /* Try to match static mapping by external address and port, + destination address and port in packet */ + if (snat_static_mapping_match(sm, key0, &sm0, 1, &is_addr_only)) + { + /* Don't NAT packet aimed at the intfc address */ + if (PREDICT_FALSE(is_interface_addr(sm, node, sw_if_index0, + ip0->dst_address.as_u32))) + { + dont_translate = 1; + goto out; + } + b0->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; + next0 = SNAT_OUT2IN_NEXT_DROP; + goto out; + } + + if (PREDICT_FALSE(icmp0->type != ICMP4_echo_reply && + (icmp0->type != ICMP4_echo_request || !is_addr_only))) + { + b0->error = node->errors[SNAT_OUT2IN_ERROR_BAD_ICMP_TYPE]; + next0 = SNAT_OUT2IN_NEXT_DROP; + goto out; + } + + /* Create session initiated by host from external network */ + s0 = create_session_for_static_mapping(sm, b0, sm0, key0, + node, thread_index); + + if (!s0) + { + next0 = SNAT_OUT2IN_NEXT_DROP; + goto out; + } + } + else + { + if 
(PREDICT_FALSE(icmp0->type != ICMP4_echo_reply &&
+                        icmp0->type != ICMP4_echo_request &&
+                        !icmp_is_error_message (icmp0)))
+        {
+          b0->error = node->errors[SNAT_OUT2IN_ERROR_BAD_ICMP_TYPE];
+          next0 = SNAT_OUT2IN_NEXT_DROP;
+          goto out;
+        }
+
+      s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions,
+                              value0.value);
+    }
+
+out:
+  *p_proto = key0.protocol;
+  if (s0)
+    *p_value = s0->in2out;
+  *p_dont_translate = dont_translate;
+  if (d)
+    *(snat_session_t**)d = s0;
+  return next0;
+}
+
+/**
+ * Get address and port values to be used for ICMP packet translation
+ *
+ * @param[in] sm                 NAT main
+ * @param[in,out] node           NAT node runtime
+ * @param[in] thread_index       thread index
+ * @param[in,out] b0             buffer containing packet to be translated
+ * @param[out] p_proto           protocol used for matching
+ * @param[out] p_value           address and port after NAT translation
+ * @param[out] p_dont_translate  if packet should not be translated
+ * @param d                      optional parameter (unused here)
+ * @param e                      optional parameter (unused here)
+ */
+u32 icmp_match_out2in_fast(snat_main_t *sm, vlib_node_runtime_t *node,
+                           u32 thread_index, vlib_buffer_t *b0,
+                           ip4_header_t *ip0, u8 *p_proto,
+                           snat_session_key_t *p_value,
+                           u8 *p_dont_translate, void *d, void *e)
+{
+  icmp46_header_t *icmp0;
+  u32 sw_if_index0;
+  u32 rx_fib_index0;
+  snat_session_key_t key0;
+  snat_session_key_t sm0;
+  u8 dont_translate = 0;
+  u8 is_addr_only;
+  u32 next0 = ~0;
+  int err;
+
+  icmp0 = (icmp46_header_t *) ip4_next_header (ip0);
+  sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+  rx_fib_index0 = ip4_fib_table_get_index_for_sw_if_index (sw_if_index0);
+
+  key0.protocol = 0;	/* keep *p_proto well defined if icmp_get_key() fails */
+
+  err = icmp_get_key (ip0, &key0);
+  if (err != -1)
+    {
+      b0->error = node->errors[err];
+      next0 = SNAT_OUT2IN_NEXT_DROP;
+      goto out2;
+    }
+  key0.fib_index = rx_fib_index0;
+
+  if (snat_static_mapping_match(sm, key0, &sm0, 1, &is_addr_only))
+    {
+      /* Don't NAT packet aimed at the intfc address */
+      if (is_interface_addr(sm, node, sw_if_index0, ip0->dst_address.as_u32))
+        {
+          dont_translate = 1;
+          goto out;
+        }
+      b0->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION];
+      next0 = SNAT_OUT2IN_NEXT_DROP;
+      goto out;
+    }
+
+  if (PREDICT_FALSE(icmp0->type != ICMP4_echo_reply &&
+                    (icmp0->type != ICMP4_echo_request || !is_addr_only) &&
+                    !icmp_is_error_message (icmp0)))
+    {
+      b0->error = node->errors[SNAT_OUT2IN_ERROR_BAD_ICMP_TYPE];
+      next0 = SNAT_OUT2IN_NEXT_DROP;
+      goto out;
+    }
+
+out:
+  *p_value = sm0;
+out2:
+  *p_proto = key0.protocol;
+  *p_dont_translate = dont_translate;
+  return next0;
+}
+
+static inline u32 icmp_out2in (snat_main_t *sm,
+                               vlib_buffer_t * b0,
+                               ip4_header_t * ip0,
+                               icmp46_header_t * icmp0,
+                               u32 sw_if_index0,
+                               u32 rx_fib_index0,
+                               vlib_node_runtime_t * node,
+                               u32 next0,
+                               u32 thread_index,
+                               void *d,
+                               void *e)
+{
+  snat_session_key_t sm0;
+  u8 protocol;
+  icmp_echo_header_t *echo0, *inner_echo0 = 0;
+  ip4_header_t *inner_ip0 = 0;
+  void *l4_header = 0;
+  icmp46_header_t *inner_icmp0;
+  u8 dont_translate;
+  u32 new_addr0, old_addr0;
+  u16 old_id0, new_id0;
+  ip_csum_t sum0;
+  u16 checksum0;
+  u32 next0_tmp;
+
+  echo0 = (icmp_echo_header_t *)(icmp0+1);
+
+  next0_tmp = sm->icmp_match_out2in_cb(sm, node, thread_index, b0, ip0,
+                                       &protocol, &sm0, &dont_translate, d, e);
+  if (next0_tmp != ~0)
+    next0 = next0_tmp;
+  if (next0 == SNAT_OUT2IN_NEXT_DROP || dont_translate)
+    goto out;
+
+  sum0 = ip_incremental_checksum (0, icmp0,
+                                  ntohs(ip0->length) - ip4_header_bytes (ip0));
+  checksum0 = ~ip_csum_fold (sum0);
+  if (checksum0 != 0 && checksum0 != 0xffff)
+    {
+      next0 = SNAT_OUT2IN_NEXT_DROP;
+      goto
out; + } + + old_addr0 = ip0->dst_address.as_u32; + new_addr0 = ip0->dst_address.as_u32 = sm0.addr.as_u32; + vnet_buffer(b0)->sw_if_index[VLIB_TX] = sm0.fib_index; + + sum0 = ip0->checksum; + sum0 = ip_csum_update (sum0, old_addr0, new_addr0, ip4_header_t, + dst_address /* changed member */); + ip0->checksum = ip_csum_fold (sum0); + + if (!icmp_is_error_message (icmp0)) + { + new_id0 = sm0.port; + if (PREDICT_FALSE(new_id0 != echo0->identifier)) + { + old_id0 = echo0->identifier; + new_id0 = sm0.port; + echo0->identifier = new_id0; + + sum0 = icmp0->checksum; + sum0 = ip_csum_update (sum0, old_id0, new_id0, icmp_echo_header_t, + identifier /* changed member */); + icmp0->checksum = ip_csum_fold (sum0); + } + } + else + { + inner_ip0 = (ip4_header_t *)(echo0+1); + l4_header = ip4_next_header (inner_ip0); + + if (!ip4_header_checksum_is_valid (inner_ip0)) + { + next0 = SNAT_OUT2IN_NEXT_DROP; + goto out; + } + + old_addr0 = inner_ip0->src_address.as_u32; + inner_ip0->src_address = sm0.addr; + new_addr0 = inner_ip0->src_address.as_u32; + + sum0 = icmp0->checksum; + sum0 = ip_csum_update (sum0, old_addr0, new_addr0, ip4_header_t, + src_address /* changed member */); + icmp0->checksum = ip_csum_fold (sum0); + + switch (protocol) + { + case SNAT_PROTOCOL_ICMP: + inner_icmp0 = (icmp46_header_t*)l4_header; + inner_echo0 = (icmp_echo_header_t *)(inner_icmp0+1); + + old_id0 = inner_echo0->identifier; + new_id0 = sm0.port; + inner_echo0->identifier = new_id0; + + sum0 = icmp0->checksum; + sum0 = ip_csum_update (sum0, old_id0, new_id0, icmp_echo_header_t, + identifier); + icmp0->checksum = ip_csum_fold (sum0); + break; + case SNAT_PROTOCOL_UDP: + case SNAT_PROTOCOL_TCP: + old_id0 = ((tcp_udp_header_t*)l4_header)->src_port; + new_id0 = sm0.port; + ((tcp_udp_header_t*)l4_header)->src_port = new_id0; + + sum0 = icmp0->checksum; + sum0 = ip_csum_update (sum0, old_id0, new_id0, tcp_udp_header_t, + src_port); + icmp0->checksum = ip_csum_fold (sum0); + break; + default: + ASSERT(0); + } + } + +out: + return next0; +} + + +static inline u32 icmp_out2in_slow_path (snat_main_t *sm, + vlib_buffer_t * b0, + ip4_header_t * ip0, + icmp46_header_t * icmp0, + u32 sw_if_index0, + u32 rx_fib_index0, + vlib_node_runtime_t * node, + u32 next0, f64 now, + u32 thread_index, + snat_session_t ** p_s0) +{ + next0 = icmp_out2in(sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, + next0, thread_index, p_s0, 0); + snat_session_t * s0 = *p_s0; + if (PREDICT_TRUE(next0 != SNAT_OUT2IN_NEXT_DROP && s0)) + { + /* Accounting */ + s0->last_heard = now; + s0->total_pkts++; + s0->total_bytes += vlib_buffer_length_in_chain (sm->vlib_main, b0); + /* Per-user LRU list maintenance for dynamic translation */ + if (!snat_is_session_static (s0)) + { + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, + s0->per_user_index); + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, + s0->per_user_list_head_index, + s0->per_user_index); + } + } + return next0; +} + +static snat_session_t * +snat_out2in_unknown_proto (snat_main_t *sm, + vlib_buffer_t * b, + ip4_header_t * ip, + u32 rx_fib_index, + u32 thread_index, + f64 now, + vlib_main_t * vm, + vlib_node_runtime_t * node) +{ + clib_bihash_kv_8_8_t kv, value; + clib_bihash_kv_16_8_t s_kv, s_value; + snat_static_mapping_t *m; + snat_session_key_t m_key; + u32 old_addr, new_addr; + ip_csum_t sum; + nat_ed_ses_key_t key; + snat_session_t * s; + snat_main_per_thread_data_t *tsm = &sm->per_thread_data[thread_index]; + snat_user_key_t u_key; + snat_user_t *u; + dlist_elt_t 
*head, *elt; + + old_addr = ip->dst_address.as_u32; + + key.l_addr = ip->dst_address; + key.r_addr = ip->src_address; + key.fib_index = rx_fib_index; + key.proto = ip->protocol; + key.rsvd = 0; + key.l_port = 0; + s_kv.key[0] = key.as_u64[0]; + s_kv.key[1] = key.as_u64[1]; + + if (!clib_bihash_search_16_8 (&sm->out2in_ed, &s_kv, &s_value)) + { + s = pool_elt_at_index (tsm->sessions, s_value.value); + new_addr = ip->dst_address.as_u32 = s->in2out.addr.as_u32; + } + else + { + if (PREDICT_FALSE (maximum_sessions_exceeded(sm, thread_index))) + { + b->error = node->errors[SNAT_OUT2IN_ERROR_MAX_SESSIONS_EXCEEDED]; + return 0; + } + + m_key.addr = ip->dst_address; + m_key.port = 0; + m_key.protocol = 0; + m_key.fib_index = rx_fib_index; + kv.key = m_key.as_u64; + if (clib_bihash_search_8_8 (&sm->static_mapping_by_external, &kv, &value)) + { + b->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; + return 0; + } + + m = pool_elt_at_index (sm->static_mappings, value.value); + + new_addr = ip->dst_address.as_u32 = m->local_addr.as_u32; + + u_key.addr = ip->src_address; + u_key.fib_index = m->fib_index; + kv.key = u_key.as_u64; + + /* Ever heard of the "user" = src ip4 address before? */ + if (clib_bihash_search_8_8 (&tsm->user_hash, &kv, &value)) + { + /* no, make a new one */ + pool_get (tsm->users, u); + memset (u, 0, sizeof (*u)); + u->addr = ip->src_address; + u->fib_index = rx_fib_index; + + pool_get (tsm->list_pool, head); + u->sessions_per_user_list_head_index = head - tsm->list_pool; + + clib_dlist_init (tsm->list_pool, + u->sessions_per_user_list_head_index); + + kv.value = u - tsm->users; + + /* add user */ + clib_bihash_add_del_8_8 (&tsm->user_hash, &kv, 1); + } + else + { + u = pool_elt_at_index (tsm->users, value.value); + } + + /* Create a new session */ + pool_get (tsm->sessions, s); + memset (s, 0, sizeof (*s)); + + s->ext_host_addr.as_u32 = ip->src_address.as_u32; + s->flags |= SNAT_SESSION_FLAG_UNKNOWN_PROTO; + s->flags |= SNAT_SESSION_FLAG_STATIC_MAPPING; + s->outside_address_index = ~0; + s->out2in.addr.as_u32 = old_addr; + s->out2in.fib_index = rx_fib_index; + s->in2out.addr.as_u32 = new_addr; + s->in2out.fib_index = m->fib_index; + s->in2out.port = s->out2in.port = ip->protocol; + u->nstaticsessions++; + + /* Create list elts */ + pool_get (tsm->list_pool, elt); + clib_dlist_init (tsm->list_pool, elt - tsm->list_pool); + elt->value = s - tsm->sessions; + s->per_user_index = elt - tsm->list_pool; + s->per_user_list_head_index = u->sessions_per_user_list_head_index; + clib_dlist_addtail (tsm->list_pool, s->per_user_list_head_index, + s->per_user_index); + + /* Add to lookup tables */ + s_kv.value = s - tsm->sessions; + if (clib_bihash_add_del_16_8 (&sm->out2in_ed, &s_kv, 1)) + clib_warning ("out2in key add failed"); + + key.l_addr = ip->dst_address; + key.fib_index = m->fib_index; + s_kv.key[0] = key.as_u64[0]; + s_kv.key[1] = key.as_u64[1]; + if (clib_bihash_add_del_16_8 (&sm->in2out_ed, &s_kv, 1)) + clib_warning ("in2out key add failed"); + } + + /* Update IP checksum */ + sum = ip->checksum; + sum = ip_csum_update (sum, old_addr, new_addr, ip4_header_t, dst_address); + ip->checksum = ip_csum_fold (sum); + + vnet_buffer(b)->sw_if_index[VLIB_TX] = s->in2out.fib_index; + + /* Accounting */ + s->last_heard = now; + s->total_pkts++; + s->total_bytes += vlib_buffer_length_in_chain (vm, b); + /* Per-user LRU list maintenance */ + clib_dlist_remove (tsm->list_pool, s->per_user_index); + clib_dlist_addtail (tsm->list_pool, s->per_user_list_head_index, + s->per_user_index); + + 
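+  /* Added note: for unknown protocols there is no L4 port to translate,
+   * so the session stores the IP protocol number in both port fields
+   * (e.g. in2out.port = out2in.port = 47 for a GRE flow). */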
return s; +} + +static snat_session_t * +snat_out2in_lb (snat_main_t *sm, + vlib_buffer_t * b, + ip4_header_t * ip, + u32 rx_fib_index, + u32 thread_index, + f64 now, + vlib_main_t * vm, + vlib_node_runtime_t * node) +{ + nat_ed_ses_key_t key; + clib_bihash_kv_16_8_t s_kv, s_value; + udp_header_t *udp = ip4_next_header (ip); + tcp_header_t *tcp = (tcp_header_t *) udp; + snat_session_t *s = 0; + snat_main_per_thread_data_t *tsm = &sm->per_thread_data[thread_index]; + snat_session_key_t e_key, l_key; + clib_bihash_kv_8_8_t kv, value; + u32 old_addr, new_addr; + u32 proto = ip_proto_to_snat_proto (ip->protocol); + u16 new_port, old_port; + ip_csum_t sum; + snat_user_key_t u_key; + snat_user_t *u; + dlist_elt_t *head, *elt; + + old_addr = ip->dst_address.as_u32; + + key.l_addr = ip->dst_address; + key.r_addr = ip->src_address; + key.fib_index = rx_fib_index; + key.proto = ip->protocol; + key.rsvd = 0; + key.l_port = udp->dst_port; + s_kv.key[0] = key.as_u64[0]; + s_kv.key[1] = key.as_u64[1]; + + if (!clib_bihash_search_16_8 (&sm->out2in_ed, &s_kv, &s_value)) + { + s = pool_elt_at_index (tsm->sessions, s_value.value); + } + else + { + if (PREDICT_FALSE (maximum_sessions_exceeded(sm, thread_index))) + { + b->error = node->errors[SNAT_OUT2IN_ERROR_MAX_SESSIONS_EXCEEDED]; + return 0; + } + + e_key.addr = ip->dst_address; + e_key.port = udp->dst_port; + e_key.protocol = proto; + e_key.fib_index = rx_fib_index; + if (snat_static_mapping_match(sm, e_key, &l_key, 1, 0)) + return 0; + + u_key.addr = l_key.addr; + u_key.fib_index = l_key.fib_index; + kv.key = u_key.as_u64; + + /* Ever heard of the "user" = src ip4 address before? */ + if (clib_bihash_search_8_8 (&tsm->user_hash, &kv, &value)) + { + /* no, make a new one */ + pool_get (tsm->users, u); + memset (u, 0, sizeof (*u)); + u->addr = l_key.addr; + u->fib_index = l_key.fib_index; + + pool_get (tsm->list_pool, head); + u->sessions_per_user_list_head_index = head - tsm->list_pool; + + clib_dlist_init (tsm->list_pool, + u->sessions_per_user_list_head_index); + + kv.value = u - tsm->users; + + /* add user */ + if (clib_bihash_add_del_8_8 (&tsm->user_hash, &kv, 1)) + clib_warning ("user key add failed"); + } + else + { + u = pool_elt_at_index (tsm->users, value.value); + } + + /* Create a new session */ + pool_get (tsm->sessions, s); + memset (s, 0, sizeof (*s)); + + s->ext_host_addr.as_u32 = ip->src_address.as_u32; + s->flags |= SNAT_SESSION_FLAG_STATIC_MAPPING; + s->flags |= SNAT_SESSION_FLAG_LOAD_BALANCING; + s->outside_address_index = ~0; + s->out2in = e_key; + s->in2out = l_key; + u->nstaticsessions++; + + /* Create list elts */ + pool_get (tsm->list_pool, elt); + clib_dlist_init (tsm->list_pool, elt - tsm->list_pool); + elt->value = s - tsm->sessions; + s->per_user_index = elt - tsm->list_pool; + s->per_user_list_head_index = u->sessions_per_user_list_head_index; + clib_dlist_addtail (tsm->list_pool, s->per_user_list_head_index, + s->per_user_index); + + /* Add to lookup tables */ + s_kv.value = s - tsm->sessions; + if (clib_bihash_add_del_16_8 (&sm->out2in_ed, &s_kv, 1)) + clib_warning ("out2in-ed key add failed"); + + key.l_addr = l_key.addr; + key.fib_index = l_key.fib_index; + key.l_port = l_key.port; + s_kv.key[0] = key.as_u64[0]; + s_kv.key[1] = key.as_u64[1]; + if (clib_bihash_add_del_16_8 (&sm->in2out_ed, &s_kv, 1)) + clib_warning ("in2out-ed key add failed"); + } + + new_addr = ip->dst_address.as_u32 = s->in2out.addr.as_u32; + + /* Update IP checksum */ + sum = ip->checksum; + sum = ip_csum_update (sum, old_addr, new_addr, ip4_header_t, 
dst_address); + ip->checksum = ip_csum_fold (sum); + + if (PREDICT_TRUE(proto == SNAT_PROTOCOL_TCP)) + { + old_port = tcp->dst_port; + tcp->dst_port = s->in2out.port; + new_port = tcp->dst_port; + + sum = tcp->checksum; + sum = ip_csum_update (sum, old_addr, new_addr, ip4_header_t, dst_address); + sum = ip_csum_update (sum, old_port, new_port, ip4_header_t, length); + tcp->checksum = ip_csum_fold(sum); + } + else + { + udp->dst_port = s->in2out.port; + udp->checksum = 0; + } + + vnet_buffer(b)->sw_if_index[VLIB_TX] = s->in2out.fib_index; + + /* Accounting */ + s->last_heard = now; + s->total_pkts++; + s->total_bytes += vlib_buffer_length_in_chain (vm, b); + return s; +} + +static uword +snat_out2in_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 n_left_from, * from, * to_next; + snat_out2in_next_t next_index; + u32 pkts_processed = 0; + snat_main_t * sm = &snat_main; + f64 now = vlib_time_now (vm); + u32 thread_index = vlib_get_thread_index (); + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next >= 2) + { + u32 bi0, bi1; + vlib_buffer_t * b0, * b1; + u32 next0 = SNAT_OUT2IN_NEXT_LOOKUP; + u32 next1 = SNAT_OUT2IN_NEXT_LOOKUP; + u32 sw_if_index0, sw_if_index1; + ip4_header_t * ip0, *ip1; + ip_csum_t sum0, sum1; + u32 new_addr0, old_addr0; + u16 new_port0, old_port0; + u32 new_addr1, old_addr1; + u16 new_port1, old_port1; + udp_header_t * udp0, * udp1; + tcp_header_t * tcp0, * tcp1; + icmp46_header_t * icmp0, * icmp1; + snat_session_key_t key0, key1, sm0, sm1; + u32 rx_fib_index0, rx_fib_index1; + u32 proto0, proto1; + snat_session_t * s0 = 0, * s1 = 0; + clib_bihash_kv_8_8_t kv0, kv1, value0, value1; + + /* Prefetch next iteration. 
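+	     Added note: the headers and first data cache line of buffers
+	     2 and 3 are fetched here so the next loop iteration does not
+	     stall on memory while b0 and b1 are being processed.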
*/ + { + vlib_buffer_t * p2, * p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE); + } + + /* speculatively enqueue b0 and b1 to the current next frame */ + to_next[0] = bi0 = from[0]; + to_next[1] = bi1 = from[1]; + from += 2; + to_next += 2; + n_left_from -= 2; + n_left_to_next -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + vnet_buffer (b0)->snat.flags = 0; + vnet_buffer (b1)->snat.flags = 0; + + ip0 = vlib_buffer_get_current (b0); + udp0 = ip4_next_header (ip0); + tcp0 = (tcp_header_t *) udp0; + icmp0 = (icmp46_header_t *) udp0; + + sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX]; + rx_fib_index0 = vec_elt (sm->ip4_main->fib_index_by_sw_if_index, + sw_if_index0); + + if (PREDICT_FALSE(ip0->ttl == 1)) + { + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + icmp4_error_set_vnet_buffer (b0, ICMP4_time_exceeded, + ICMP4_time_exceeded_ttl_exceeded_in_transit, + 0); + next0 = SNAT_OUT2IN_NEXT_ICMP_ERROR; + goto trace0; + } + + proto0 = ip_proto_to_snat_proto (ip0->protocol); + + if (PREDICT_FALSE (proto0 == ~0)) + { + s0 = snat_out2in_unknown_proto(sm, b0, ip0, rx_fib_index0, + thread_index, now, vm, node); + if (!s0) + next0 = SNAT_OUT2IN_NEXT_DROP; + goto trace0; + } + + if (PREDICT_FALSE (proto0 == SNAT_PROTOCOL_ICMP)) + { + next0 = icmp_out2in_slow_path + (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, + next0, now, thread_index, &s0); + goto trace0; + } + + key0.addr = ip0->dst_address; + key0.port = udp0->dst_port; + key0.protocol = proto0; + key0.fib_index = rx_fib_index0; + + kv0.key = key0.as_u64; + + if (clib_bihash_search_8_8 (&sm->per_thread_data[thread_index].out2in, + &kv0, &value0)) + { + /* Try to match static mapping by external address and port, + destination address and port in packet */ + if (snat_static_mapping_match(sm, key0, &sm0, 1, 0)) + { + b0->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; + /* + * Send DHCP packets to the ipv4 stack, or we won't + * be able to use dhcp client on the outside interface + */ + if (proto0 != SNAT_PROTOCOL_UDP + || (udp0->dst_port + != clib_host_to_net_u16(UDP_DST_PORT_dhcp_to_client))) + next0 = SNAT_OUT2IN_NEXT_DROP; + goto trace0; + } + + /* Create session initiated by host from external network */ + s0 = create_session_for_static_mapping(sm, b0, sm0, key0, node, + thread_index); + if (!s0) + { + next0 = SNAT_OUT2IN_NEXT_DROP; + goto trace0; + } + } + else + { + if (PREDICT_FALSE (value0.value == ~0ULL)) + { + s0 = snat_out2in_lb(sm, b0, ip0, rx_fib_index0, thread_index, + now, vm, node); + if (!s0) + next0 = SNAT_OUT2IN_NEXT_DROP; + goto trace0; + } + else + { + s0 = pool_elt_at_index ( + sm->per_thread_data[thread_index].sessions, + value0.value); + } + } + + old_addr0 = ip0->dst_address.as_u32; + ip0->dst_address = s0->in2out.addr; + new_addr0 = ip0->dst_address.as_u32; + vnet_buffer(b0)->sw_if_index[VLIB_TX] = s0->in2out.fib_index; + + sum0 = ip0->checksum; + sum0 = ip_csum_update (sum0, old_addr0, new_addr0, + ip4_header_t, + dst_address /* changed member */); + ip0->checksum = ip_csum_fold (sum0); + + if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) + { + old_port0 = tcp0->dst_port; + tcp0->dst_port = s0->in2out.port; + new_port0 = tcp0->dst_port; + + sum0 = tcp0->checksum; + sum0 = ip_csum_update (sum0, old_addr0, new_addr0, + ip4_header_t, 
+ dst_address /* changed member */); + + sum0 = ip_csum_update (sum0, old_port0, new_port0, + ip4_header_t /* cheat */, + length /* changed member */); + tcp0->checksum = ip_csum_fold(sum0); + } + else + { + old_port0 = udp0->dst_port; + udp0->dst_port = s0->in2out.port; + udp0->checksum = 0; + } + + /* Accounting */ + s0->last_heard = now; + s0->total_pkts++; + s0->total_bytes += vlib_buffer_length_in_chain (vm, b0); + /* Per-user LRU list maintenance for dynamic translation */ + if (!snat_is_session_static (s0)) + { + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, + s0->per_user_index); + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, + s0->per_user_list_head_index, + s0->per_user_index); + } + trace0: + + if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + snat_out2in_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->sw_if_index = sw_if_index0; + t->next_index = next0; + t->session_index = ~0; + if (s0) + t->session_index = s0 - sm->per_thread_data[thread_index].sessions; + } + + pkts_processed += next0 != SNAT_OUT2IN_NEXT_DROP; + + + ip1 = vlib_buffer_get_current (b1); + udp1 = ip4_next_header (ip1); + tcp1 = (tcp_header_t *) udp1; + icmp1 = (icmp46_header_t *) udp1; + + sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX]; + rx_fib_index1 = vec_elt (sm->ip4_main->fib_index_by_sw_if_index, + sw_if_index1); + + if (PREDICT_FALSE(ip1->ttl == 1)) + { + vnet_buffer (b1)->sw_if_index[VLIB_TX] = (u32) ~ 0; + icmp4_error_set_vnet_buffer (b1, ICMP4_time_exceeded, + ICMP4_time_exceeded_ttl_exceeded_in_transit, + 0); + next1 = SNAT_OUT2IN_NEXT_ICMP_ERROR; + goto trace1; + } + + proto1 = ip_proto_to_snat_proto (ip1->protocol); + + if (PREDICT_FALSE (proto1 == ~0)) + { + s1 = snat_out2in_unknown_proto(sm, b1, ip1, rx_fib_index1, + thread_index, now, vm, node); + if (!s1) + next1 = SNAT_OUT2IN_NEXT_DROP; + goto trace1; + } + + if (PREDICT_FALSE (proto1 == SNAT_PROTOCOL_ICMP)) + { + next1 = icmp_out2in_slow_path + (sm, b1, ip1, icmp1, sw_if_index1, rx_fib_index1, node, + next1, now, thread_index, &s1); + goto trace1; + } + + key1.addr = ip1->dst_address; + key1.port = udp1->dst_port; + key1.protocol = proto1; + key1.fib_index = rx_fib_index1; + + kv1.key = key1.as_u64; + + if (clib_bihash_search_8_8 (&sm->per_thread_data[thread_index].out2in, + &kv1, &value1)) + { + /* Try to match static mapping by external address and port, + destination address and port in packet */ + if (snat_static_mapping_match(sm, key1, &sm1, 1, 0)) + { + b1->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; + /* + * Send DHCP packets to the ipv4 stack, or we won't + * be able to use dhcp client on the outside interface + */ + if (proto1 != SNAT_PROTOCOL_UDP + || (udp1->dst_port + != clib_host_to_net_u16(UDP_DST_PORT_dhcp_to_client))) + next1 = SNAT_OUT2IN_NEXT_DROP; + goto trace1; + } + + /* Create session initiated by host from external network */ + s1 = create_session_for_static_mapping(sm, b1, sm1, key1, node, + thread_index); + if (!s1) + { + next1 = SNAT_OUT2IN_NEXT_DROP; + goto trace1; + } + } + else + { + if (PREDICT_FALSE (value1.value == ~0ULL)) + { + s1 = snat_out2in_lb(sm, b1, ip1, rx_fib_index1, thread_index, + now, vm, node); + if (!s1) + next1 = SNAT_OUT2IN_NEXT_DROP; + goto trace1; + } + else + { + s1 = pool_elt_at_index ( + sm->per_thread_data[thread_index].sessions, + value1.value); + } + } + + old_addr1 = ip1->dst_address.as_u32; + ip1->dst_address = s1->in2out.addr; + new_addr1 = 
ip1->dst_address.as_u32; + vnet_buffer(b1)->sw_if_index[VLIB_TX] = s1->in2out.fib_index; + + sum1 = ip1->checksum; + sum1 = ip_csum_update (sum1, old_addr1, new_addr1, + ip4_header_t, + dst_address /* changed member */); + ip1->checksum = ip_csum_fold (sum1); + + if (PREDICT_TRUE(proto1 == SNAT_PROTOCOL_TCP)) + { + old_port1 = tcp1->dst_port; + tcp1->dst_port = s1->in2out.port; + new_port1 = tcp1->dst_port; + + sum1 = tcp1->checksum; + sum1 = ip_csum_update (sum1, old_addr1, new_addr1, + ip4_header_t, + dst_address /* changed member */); + + sum1 = ip_csum_update (sum1, old_port1, new_port1, + ip4_header_t /* cheat */, + length /* changed member */); + tcp1->checksum = ip_csum_fold(sum1); + } + else + { + old_port1 = udp1->dst_port; + udp1->dst_port = s1->in2out.port; + udp1->checksum = 0; + } + + /* Accounting */ + s1->last_heard = now; + s1->total_pkts++; + s1->total_bytes += vlib_buffer_length_in_chain (vm, b1); + /* Per-user LRU list maintenance for dynamic translation */ + if (!snat_is_session_static (s1)) + { + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, + s1->per_user_index); + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, + s1->per_user_list_head_index, + s1->per_user_index); + } + trace1: + + if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE) + && (b1->flags & VLIB_BUFFER_IS_TRACED))) + { + snat_out2in_trace_t *t = + vlib_add_trace (vm, node, b1, sizeof (*t)); + t->sw_if_index = sw_if_index1; + t->next_index = next1; + t->session_index = ~0; + if (s1) + t->session_index = s1 - sm->per_thread_data[thread_index].sessions; + } + + pkts_processed += next1 != SNAT_OUT2IN_NEXT_DROP; + + /* verify speculative enqueues, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, next0, next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t * b0; + u32 next0 = SNAT_OUT2IN_NEXT_LOOKUP; + u32 sw_if_index0; + ip4_header_t * ip0; + ip_csum_t sum0; + u32 new_addr0, old_addr0; + u16 new_port0, old_port0; + udp_header_t * udp0; + tcp_header_t * tcp0; + icmp46_header_t * icmp0; + snat_session_key_t key0, sm0; + u32 rx_fib_index0; + u32 proto0; + snat_session_t * s0 = 0; + clib_bihash_kv_8_8_t kv0, value0; + + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + vnet_buffer (b0)->snat.flags = 0; + + ip0 = vlib_buffer_get_current (b0); + udp0 = ip4_next_header (ip0); + tcp0 = (tcp_header_t *) udp0; + icmp0 = (icmp46_header_t *) udp0; + + sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX]; + rx_fib_index0 = vec_elt (sm->ip4_main->fib_index_by_sw_if_index, + sw_if_index0); + + proto0 = ip_proto_to_snat_proto (ip0->protocol); + + if (PREDICT_FALSE (proto0 == ~0)) + { + s0 = snat_out2in_unknown_proto(sm, b0, ip0, rx_fib_index0, + thread_index, now, vm, node); + if (!s0) + next0 = SNAT_OUT2IN_NEXT_DROP; + goto trace00; + } + + if (PREDICT_FALSE(ip0->ttl == 1)) + { + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + icmp4_error_set_vnet_buffer (b0, ICMP4_time_exceeded, + ICMP4_time_exceeded_ttl_exceeded_in_transit, + 0); + next0 = SNAT_OUT2IN_NEXT_ICMP_ERROR; + goto trace00; + } + + if (PREDICT_FALSE (proto0 == SNAT_PROTOCOL_ICMP)) + { + next0 = icmp_out2in_slow_path + (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, + next0, now, thread_index, &s0); + goto trace00; + } + + key0.addr = 
ip0->dst_address; + key0.port = udp0->dst_port; + key0.protocol = proto0; + key0.fib_index = rx_fib_index0; + + kv0.key = key0.as_u64; + + if (clib_bihash_search_8_8 (&sm->per_thread_data[thread_index].out2in, + &kv0, &value0)) + { + /* Try to match static mapping by external address and port, + destination address and port in packet */ + if (snat_static_mapping_match(sm, key0, &sm0, 1, 0)) + { + b0->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; + /* + * Send DHCP packets to the ipv4 stack, or we won't + * be able to use dhcp client on the outside interface + */ + if (proto0 != SNAT_PROTOCOL_UDP + || (udp0->dst_port + != clib_host_to_net_u16(UDP_DST_PORT_dhcp_to_client))) + + next0 = SNAT_OUT2IN_NEXT_DROP; + goto trace00; + } + + /* Create session initiated by host from external network */ + s0 = create_session_for_static_mapping(sm, b0, sm0, key0, node, + thread_index); + if (!s0) + { + next0 = SNAT_OUT2IN_NEXT_DROP; + goto trace00; + } + } + else + { + if (PREDICT_FALSE (value0.value == ~0ULL)) + { + s0 = snat_out2in_lb(sm, b0, ip0, rx_fib_index0, thread_index, + now, vm, node); + if (!s0) + next0 = SNAT_OUT2IN_NEXT_DROP; + goto trace00; + } + else + { + s0 = pool_elt_at_index ( + sm->per_thread_data[thread_index].sessions, + value0.value); + } + } + + old_addr0 = ip0->dst_address.as_u32; + ip0->dst_address = s0->in2out.addr; + new_addr0 = ip0->dst_address.as_u32; + vnet_buffer(b0)->sw_if_index[VLIB_TX] = s0->in2out.fib_index; + + sum0 = ip0->checksum; + sum0 = ip_csum_update (sum0, old_addr0, new_addr0, + ip4_header_t, + dst_address /* changed member */); + ip0->checksum = ip_csum_fold (sum0); + + if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) + { + old_port0 = tcp0->dst_port; + tcp0->dst_port = s0->in2out.port; + new_port0 = tcp0->dst_port; + + sum0 = tcp0->checksum; + sum0 = ip_csum_update (sum0, old_addr0, new_addr0, + ip4_header_t, + dst_address /* changed member */); + + sum0 = ip_csum_update (sum0, old_port0, new_port0, + ip4_header_t /* cheat */, + length /* changed member */); + tcp0->checksum = ip_csum_fold(sum0); + } + else + { + old_port0 = udp0->dst_port; + udp0->dst_port = s0->in2out.port; + udp0->checksum = 0; + } + + /* Accounting */ + s0->last_heard = now; + s0->total_pkts++; + s0->total_bytes += vlib_buffer_length_in_chain (vm, b0); + /* Per-user LRU list maintenance for dynamic translation */ + if (!snat_is_session_static (s0)) + { + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, + s0->per_user_index); + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, + s0->per_user_list_head_index, + s0->per_user_index); + } + trace00: + + if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + snat_out2in_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->sw_if_index = sw_if_index0; + t->next_index = next0; + t->session_index = ~0; + if (s0) + t->session_index = s0 - sm->per_thread_data[thread_index].sessions; + } + + pkts_processed += next0 != SNAT_OUT2IN_NEXT_DROP; + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, snat_out2in_node.index, + SNAT_OUT2IN_ERROR_OUT2IN_PACKETS, + pkts_processed); + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (snat_out2in_node) = { + .function = snat_out2in_node_fn, + .name = "nat44-out2in", + .vector_size = sizeof (u32), 
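+  /* Added note: each vector element is a u32 buffer index, hence
+     .vector_size = sizeof (u32) above. */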
+ .format_trace = format_snat_out2in_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(snat_out2in_error_strings), + .error_strings = snat_out2in_error_strings, + + .runtime_data_bytes = sizeof (snat_runtime_t), + + .n_next_nodes = SNAT_OUT2IN_N_NEXT, + + /* edit / add dispositions here */ + .next_nodes = { + [SNAT_OUT2IN_NEXT_DROP] = "error-drop", + [SNAT_OUT2IN_NEXT_LOOKUP] = "ip4-lookup", + [SNAT_OUT2IN_NEXT_ICMP_ERROR] = "ip4-icmp-error", + }, +}; +VLIB_NODE_FUNCTION_MULTIARCH (snat_out2in_node, snat_out2in_node_fn); + +/**************************/ +/*** deterministic mode ***/ +/**************************/ +static uword +snat_det_out2in_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 n_left_from, * from, * to_next; + snat_out2in_next_t next_index; + u32 pkts_processed = 0; + snat_main_t * sm = &snat_main; + u32 thread_index = vlib_get_thread_index (); + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next >= 2) + { + u32 bi0, bi1; + vlib_buffer_t * b0, * b1; + u32 next0 = SNAT_OUT2IN_NEXT_LOOKUP; + u32 next1 = SNAT_OUT2IN_NEXT_LOOKUP; + u32 sw_if_index0, sw_if_index1; + ip4_header_t * ip0, * ip1; + ip_csum_t sum0, sum1; + ip4_address_t new_addr0, old_addr0, new_addr1, old_addr1; + u16 new_port0, old_port0, old_port1, new_port1; + udp_header_t * udp0, * udp1; + tcp_header_t * tcp0, * tcp1; + u32 proto0, proto1; + snat_det_out_key_t key0, key1; + snat_det_map_t * dm0, * dm1; + snat_det_session_t * ses0 = 0, * ses1 = 0; + u32 rx_fib_index0, rx_fib_index1; + icmp46_header_t * icmp0, * icmp1; + + /* Prefetch next iteration. 
*/ + { + vlib_buffer_t * p2, * p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE); + } + + /* speculatively enqueue b0 and b1 to the current next frame */ + to_next[0] = bi0 = from[0]; + to_next[1] = bi1 = from[1]; + from += 2; + to_next += 2; + n_left_from -= 2; + n_left_to_next -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + ip0 = vlib_buffer_get_current (b0); + udp0 = ip4_next_header (ip0); + tcp0 = (tcp_header_t *) udp0; + + sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX]; + + if (PREDICT_FALSE(ip0->ttl == 1)) + { + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + icmp4_error_set_vnet_buffer (b0, ICMP4_time_exceeded, + ICMP4_time_exceeded_ttl_exceeded_in_transit, + 0); + next0 = SNAT_OUT2IN_NEXT_ICMP_ERROR; + goto trace0; + } + + proto0 = ip_proto_to_snat_proto (ip0->protocol); + + if (PREDICT_FALSE(proto0 == SNAT_PROTOCOL_ICMP)) + { + rx_fib_index0 = ip4_fib_table_get_index_for_sw_if_index(sw_if_index0); + icmp0 = (icmp46_header_t *) udp0; + + next0 = icmp_out2in(sm, b0, ip0, icmp0, sw_if_index0, + rx_fib_index0, node, next0, thread_index, + &ses0, &dm0); + goto trace0; + } + + key0.ext_host_addr = ip0->src_address; + key0.ext_host_port = tcp0->src; + key0.out_port = tcp0->dst; + + dm0 = snat_det_map_by_out(sm, &ip0->dst_address); + if (PREDICT_FALSE(!dm0)) + { + clib_warning("unknown dst address: %U", + format_ip4_address, &ip0->dst_address); + next0 = SNAT_OUT2IN_NEXT_DROP; + b0->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; + goto trace0; + } + + snat_det_reverse(dm0, &ip0->dst_address, + clib_net_to_host_u16(tcp0->dst), &new_addr0); + + ses0 = snat_det_get_ses_by_out (dm0, &new_addr0, key0.as_u64); + if (PREDICT_FALSE(!ses0)) + { + clib_warning("no match src %U:%d dst %U:%d for user %U", + format_ip4_address, &ip0->src_address, + clib_net_to_host_u16 (tcp0->src), + format_ip4_address, &ip0->dst_address, + clib_net_to_host_u16 (tcp0->dst), + format_ip4_address, &new_addr0); + next0 = SNAT_OUT2IN_NEXT_DROP; + b0->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; + goto trace0; + } + new_port0 = ses0->in_port; + + old_addr0 = ip0->dst_address; + ip0->dst_address = new_addr0; + vnet_buffer(b0)->sw_if_index[VLIB_TX] = sm->inside_fib_index; + + sum0 = ip0->checksum; + sum0 = ip_csum_update (sum0, old_addr0.as_u32, new_addr0.as_u32, + ip4_header_t, + dst_address /* changed member */); + ip0->checksum = ip_csum_fold (sum0); + + if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) + { + if (tcp0->flags & TCP_FLAG_FIN && ses0->state == SNAT_SESSION_TCP_ESTABLISHED) + ses0->state = SNAT_SESSION_TCP_CLOSE_WAIT; + else if (tcp0->flags & TCP_FLAG_ACK && ses0->state == SNAT_SESSION_TCP_LAST_ACK) + snat_det_ses_close(dm0, ses0); + + old_port0 = tcp0->dst; + tcp0->dst = new_port0; + + sum0 = tcp0->checksum; + sum0 = ip_csum_update (sum0, old_addr0.as_u32, new_addr0.as_u32, + ip4_header_t, + dst_address /* changed member */); + + sum0 = ip_csum_update (sum0, old_port0, new_port0, + ip4_header_t /* cheat */, + length /* changed member */); + tcp0->checksum = ip_csum_fold(sum0); + } + else + { + old_port0 = udp0->dst_port; + udp0->dst_port = new_port0; + udp0->checksum = 0; + } + + trace0: + + if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + snat_out2in_trace_t *t = 
+ vlib_add_trace (vm, node, b0, sizeof (*t)); + t->sw_if_index = sw_if_index0; + t->next_index = next0; + t->session_index = ~0; + if (ses0) + t->session_index = ses0 - dm0->sessions; + } + + pkts_processed += next0 != SNAT_OUT2IN_NEXT_DROP; + + b1 = vlib_get_buffer (vm, bi1); + + ip1 = vlib_buffer_get_current (b1); + udp1 = ip4_next_header (ip1); + tcp1 = (tcp_header_t *) udp1; + + sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX]; + + if (PREDICT_FALSE(ip1->ttl == 1)) + { + vnet_buffer (b1)->sw_if_index[VLIB_TX] = (u32) ~ 0; + icmp4_error_set_vnet_buffer (b1, ICMP4_time_exceeded, + ICMP4_time_exceeded_ttl_exceeded_in_transit, + 0); + next1 = SNAT_OUT2IN_NEXT_ICMP_ERROR; + goto trace1; + } + + proto1 = ip_proto_to_snat_proto (ip1->protocol); + + if (PREDICT_FALSE(proto1 == SNAT_PROTOCOL_ICMP)) + { + rx_fib_index1 = ip4_fib_table_get_index_for_sw_if_index(sw_if_index1); + icmp1 = (icmp46_header_t *) udp1; + + next1 = icmp_out2in(sm, b1, ip1, icmp1, sw_if_index1, + rx_fib_index1, node, next1, thread_index, + &ses1, &dm1); + goto trace1; + } + + key1.ext_host_addr = ip1->src_address; + key1.ext_host_port = tcp1->src; + key1.out_port = tcp1->dst; + + dm1 = snat_det_map_by_out(sm, &ip1->dst_address); + if (PREDICT_FALSE(!dm1)) + { + clib_warning("unknown dst address: %U", + format_ip4_address, &ip1->dst_address); + next1 = SNAT_OUT2IN_NEXT_DROP; + b1->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; + goto trace1; + } + + snat_det_reverse(dm1, &ip1->dst_address, + clib_net_to_host_u16(tcp1->dst), &new_addr1); + + ses1 = snat_det_get_ses_by_out (dm1, &new_addr1, key1.as_u64); + if (PREDICT_FALSE(!ses1)) + { + clib_warning("no match src %U:%d dst %U:%d for user %U", + format_ip4_address, &ip1->src_address, + clib_net_to_host_u16 (tcp1->src), + format_ip4_address, &ip1->dst_address, + clib_net_to_host_u16 (tcp1->dst), + format_ip4_address, &new_addr1); + next1 = SNAT_OUT2IN_NEXT_DROP; + b1->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; + goto trace1; + } + new_port1 = ses1->in_port; + + old_addr1 = ip1->dst_address; + ip1->dst_address = new_addr1; + vnet_buffer(b1)->sw_if_index[VLIB_TX] = sm->inside_fib_index; + + sum1 = ip1->checksum; + sum1 = ip_csum_update (sum1, old_addr1.as_u32, new_addr1.as_u32, + ip4_header_t, + dst_address /* changed member */); + ip1->checksum = ip_csum_fold (sum1); + + if (PREDICT_TRUE(proto1 == SNAT_PROTOCOL_TCP)) + { + if (tcp1->flags & TCP_FLAG_FIN && ses1->state == SNAT_SESSION_TCP_ESTABLISHED) + ses1->state = SNAT_SESSION_TCP_CLOSE_WAIT; + else if (tcp1->flags & TCP_FLAG_ACK && ses1->state == SNAT_SESSION_TCP_LAST_ACK) + snat_det_ses_close(dm1, ses1); + + old_port1 = tcp1->dst; + tcp1->dst = new_port1; + + sum1 = tcp1->checksum; + sum1 = ip_csum_update (sum1, old_addr1.as_u32, new_addr1.as_u32, + ip4_header_t, + dst_address /* changed member */); + + sum1 = ip_csum_update (sum1, old_port1, new_port1, + ip4_header_t /* cheat */, + length /* changed member */); + tcp1->checksum = ip_csum_fold(sum1); + } + else + { + old_port1 = udp1->dst_port; + udp1->dst_port = new_port1; + udp1->checksum = 0; + } + + trace1: + + if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE) + && (b1->flags & VLIB_BUFFER_IS_TRACED))) + { + snat_out2in_trace_t *t = + vlib_add_trace (vm, node, b1, sizeof (*t)); + t->sw_if_index = sw_if_index1; + t->next_index = next1; + t->session_index = ~0; + if (ses1) + t->session_index = ses1 - dm1->sessions; + } + + pkts_processed += next1 != SNAT_OUT2IN_NEXT_DROP; + + /* verify speculative enqueues, maybe switch current next frame */ 
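+	  /* Added note: b0/b1 were queued speculatively to the previous
+	   * next_index; the x2 validate call re-homes any buffer whose
+	   * computed next0/next1 differs (e.g. a drop). */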
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, next0, next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t * b0; + u32 next0 = SNAT_OUT2IN_NEXT_LOOKUP; + u32 sw_if_index0; + ip4_header_t * ip0; + ip_csum_t sum0; + ip4_address_t new_addr0, old_addr0; + u16 new_port0, old_port0; + udp_header_t * udp0; + tcp_header_t * tcp0; + u32 proto0; + snat_det_out_key_t key0; + snat_det_map_t * dm0; + snat_det_session_t * ses0 = 0; + u32 rx_fib_index0; + icmp46_header_t * icmp0; + + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + ip0 = vlib_buffer_get_current (b0); + udp0 = ip4_next_header (ip0); + tcp0 = (tcp_header_t *) udp0; + + sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX]; + + if (PREDICT_FALSE(ip0->ttl == 1)) + { + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + icmp4_error_set_vnet_buffer (b0, ICMP4_time_exceeded, + ICMP4_time_exceeded_ttl_exceeded_in_transit, + 0); + next0 = SNAT_OUT2IN_NEXT_ICMP_ERROR; + goto trace00; + } + + proto0 = ip_proto_to_snat_proto (ip0->protocol); + + if (PREDICT_FALSE(proto0 == SNAT_PROTOCOL_ICMP)) + { + rx_fib_index0 = ip4_fib_table_get_index_for_sw_if_index(sw_if_index0); + icmp0 = (icmp46_header_t *) udp0; + + next0 = icmp_out2in(sm, b0, ip0, icmp0, sw_if_index0, + rx_fib_index0, node, next0, thread_index, + &ses0, &dm0); + goto trace00; + } + + key0.ext_host_addr = ip0->src_address; + key0.ext_host_port = tcp0->src; + key0.out_port = tcp0->dst; + + dm0 = snat_det_map_by_out(sm, &ip0->dst_address); + if (PREDICT_FALSE(!dm0)) + { + clib_warning("unknown dst address: %U", + format_ip4_address, &ip0->dst_address); + next0 = SNAT_OUT2IN_NEXT_DROP; + b0->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; + goto trace00; + } + + snat_det_reverse(dm0, &ip0->dst_address, + clib_net_to_host_u16(tcp0->dst), &new_addr0); + + ses0 = snat_det_get_ses_by_out (dm0, &new_addr0, key0.as_u64); + if (PREDICT_FALSE(!ses0)) + { + clib_warning("no match src %U:%d dst %U:%d for user %U", + format_ip4_address, &ip0->src_address, + clib_net_to_host_u16 (tcp0->src), + format_ip4_address, &ip0->dst_address, + clib_net_to_host_u16 (tcp0->dst), + format_ip4_address, &new_addr0); + next0 = SNAT_OUT2IN_NEXT_DROP; + b0->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; + goto trace00; + } + new_port0 = ses0->in_port; + + old_addr0 = ip0->dst_address; + ip0->dst_address = new_addr0; + vnet_buffer(b0)->sw_if_index[VLIB_TX] = sm->inside_fib_index; + + sum0 = ip0->checksum; + sum0 = ip_csum_update (sum0, old_addr0.as_u32, new_addr0.as_u32, + ip4_header_t, + dst_address /* changed member */); + ip0->checksum = ip_csum_fold (sum0); + + if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) + { + if (tcp0->flags & TCP_FLAG_FIN && ses0->state == SNAT_SESSION_TCP_ESTABLISHED) + ses0->state = SNAT_SESSION_TCP_CLOSE_WAIT; + else if (tcp0->flags & TCP_FLAG_ACK && ses0->state == SNAT_SESSION_TCP_LAST_ACK) + snat_det_ses_close(dm0, ses0); + + old_port0 = tcp0->dst; + tcp0->dst = new_port0; + + sum0 = tcp0->checksum; + sum0 = ip_csum_update (sum0, old_addr0.as_u32, new_addr0.as_u32, + ip4_header_t, + dst_address /* changed member */); + + sum0 = ip_csum_update (sum0, old_port0, new_port0, + ip4_header_t /* cheat */, + length /* changed member */); + tcp0->checksum = ip_csum_fold(sum0); + } + else + { + old_port0 = udp0->dst_port; + udp0->dst_port = 
new_port0; + udp0->checksum = 0; + } + + trace00: + + if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + snat_out2in_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->sw_if_index = sw_if_index0; + t->next_index = next0; + t->session_index = ~0; + if (ses0) + t->session_index = ses0 - dm0->sessions; + } + + pkts_processed += next0 != SNAT_OUT2IN_NEXT_DROP; + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, snat_det_out2in_node.index, + SNAT_OUT2IN_ERROR_OUT2IN_PACKETS, + pkts_processed); + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (snat_det_out2in_node) = { + .function = snat_det_out2in_node_fn, + .name = "nat44-det-out2in", + .vector_size = sizeof (u32), + .format_trace = format_snat_out2in_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(snat_out2in_error_strings), + .error_strings = snat_out2in_error_strings, + + .runtime_data_bytes = sizeof (snat_runtime_t), + + .n_next_nodes = SNAT_OUT2IN_N_NEXT, + + /* edit / add dispositions here */ + .next_nodes = { + [SNAT_OUT2IN_NEXT_DROP] = "error-drop", + [SNAT_OUT2IN_NEXT_LOOKUP] = "ip4-lookup", + [SNAT_OUT2IN_NEXT_ICMP_ERROR] = "ip4-icmp-error", + }, +}; +VLIB_NODE_FUNCTION_MULTIARCH (snat_det_out2in_node, snat_det_out2in_node_fn); + +/** + * Get address and port values to be used for ICMP packet translation + * and create session if needed + * + * @param[in,out] sm NAT main + * @param[in,out] node NAT node runtime + * @param[in] thread_index thread index + * @param[in,out] b0 buffer containing packet to be translated + * @param[out] p_proto protocol used for matching + * @param[out] p_value address and port after NAT translation + * @param[out] p_dont_translate if packet should not be translated + * @param d optional parameter + * @param e optional parameter + */ +u32 icmp_match_out2in_det(snat_main_t *sm, vlib_node_runtime_t *node, + u32 thread_index, vlib_buffer_t *b0, + ip4_header_t *ip0, u8 *p_proto, + snat_session_key_t *p_value, + u8 *p_dont_translate, void *d, void *e) +{ + icmp46_header_t *icmp0; + u32 sw_if_index0; + u8 protocol; + snat_det_out_key_t key0; + u8 dont_translate = 0; + u32 next0 = ~0; + icmp_echo_header_t *echo0, *inner_echo0 = 0; + ip4_header_t *inner_ip0; + void *l4_header = 0; + icmp46_header_t *inner_icmp0; + snat_det_map_t * dm0 = 0; + ip4_address_t new_addr0 = {{0}}; + snat_det_session_t * ses0 = 0; + ip4_address_t out_addr; + + icmp0 = (icmp46_header_t *) ip4_next_header (ip0); + echo0 = (icmp_echo_header_t *)(icmp0+1); + sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX]; + + if (!icmp_is_error_message (icmp0)) + { + protocol = SNAT_PROTOCOL_ICMP; + key0.ext_host_addr = ip0->src_address; + key0.ext_host_port = 0; + key0.out_port = echo0->identifier; + out_addr = ip0->dst_address; + } + else + { + inner_ip0 = (ip4_header_t *)(echo0+1); + l4_header = ip4_next_header (inner_ip0); + protocol = ip_proto_to_snat_proto (inner_ip0->protocol); + key0.ext_host_addr = inner_ip0->dst_address; + out_addr = inner_ip0->src_address; + switch (protocol) + { + case SNAT_PROTOCOL_ICMP: + inner_icmp0 = (icmp46_header_t*)l4_header; + inner_echo0 = (icmp_echo_header_t *)(inner_icmp0+1); + key0.ext_host_port = 0; + key0.out_port = inner_echo0->identifier; + break; + case SNAT_PROTOCOL_UDP: + case SNAT_PROTOCOL_TCP: + 
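+	  /* Added note: the embedded datagram is the original in2out packet
+	   * echoed back, so its source port is the NAT outside port and its
+	   * destination port belongs to the external host. */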
key0.ext_host_port = ((tcp_udp_header_t*)l4_header)->dst_port; + key0.out_port = ((tcp_udp_header_t*)l4_header)->src_port; + break; + default: + b0->error = node->errors[SNAT_OUT2IN_ERROR_UNSUPPORTED_PROTOCOL]; + next0 = SNAT_OUT2IN_NEXT_DROP; + goto out; + } + } + + dm0 = snat_det_map_by_out(sm, &out_addr); + if (PREDICT_FALSE(!dm0)) + { + /* Don't NAT packet aimed at the intfc address */ + if (PREDICT_FALSE(is_interface_addr(sm, node, sw_if_index0, + ip0->dst_address.as_u32))) + { + dont_translate = 1; + goto out; + } + clib_warning("unknown dst address: %U", + format_ip4_address, &ip0->dst_address); + goto out; + } + + snat_det_reverse(dm0, &ip0->dst_address, + clib_net_to_host_u16(key0.out_port), &new_addr0); + + ses0 = snat_det_get_ses_by_out (dm0, &new_addr0, key0.as_u64); + if (PREDICT_FALSE(!ses0)) + { + /* Don't NAT packet aimed at the intfc address */ + if (PREDICT_FALSE(is_interface_addr(sm, node, sw_if_index0, + ip0->dst_address.as_u32))) + { + dont_translate = 1; + goto out; + } + clib_warning("no match src %U:%d dst %U:%d for user %U", + format_ip4_address, &key0.ext_host_addr, + clib_net_to_host_u16 (key0.ext_host_port), + format_ip4_address, &out_addr, + clib_net_to_host_u16 (key0.out_port), + format_ip4_address, &new_addr0); + b0->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; + next0 = SNAT_OUT2IN_NEXT_DROP; + goto out; + } + + if (PREDICT_FALSE(icmp0->type != ICMP4_echo_reply && + !icmp_is_error_message (icmp0))) + { + b0->error = node->errors[SNAT_OUT2IN_ERROR_BAD_ICMP_TYPE]; + next0 = SNAT_OUT2IN_NEXT_DROP; + goto out; + } + + goto out; + +out: + *p_proto = protocol; + if (ses0) + { + p_value->addr = new_addr0; + p_value->fib_index = sm->inside_fib_index; + p_value->port = ses0->in_port; + } + *p_dont_translate = dont_translate; + if (d) + *(snat_det_session_t**)d = ses0; + if (e) + *(snat_det_map_t**)e = dm0; + return next0; +} + +/**********************/ +/*** worker handoff ***/ +/**********************/ +static uword +snat_out2in_worker_handoff_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + snat_main_t *sm = &snat_main; + vlib_thread_main_t *tm = vlib_get_thread_main (); + u32 n_left_from, *from, *to_next = 0; + static __thread vlib_frame_queue_elt_t **handoff_queue_elt_by_worker_index; + static __thread vlib_frame_queue_t **congested_handoff_queue_by_worker_index + = 0; + vlib_frame_queue_elt_t *hf = 0; + vlib_frame_t *f = 0; + int i; + u32 n_left_to_next_worker = 0, *to_next_worker = 0; + u32 next_worker_index = 0; + u32 current_worker_index = ~0; + u32 thread_index = vlib_get_thread_index (); + + ASSERT (vec_len (sm->workers)); + + if (PREDICT_FALSE (handoff_queue_elt_by_worker_index == 0)) + { + vec_validate (handoff_queue_elt_by_worker_index, tm->n_vlib_mains - 1); + + vec_validate_init_empty (congested_handoff_queue_by_worker_index, + sm->first_worker_index + sm->num_workers - 1, + (vlib_frame_queue_t *) (~0)); + } + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + + while (n_left_from > 0) + { + u32 bi0; + vlib_buffer_t *b0; + u32 sw_if_index0; + u32 rx_fib_index0; + ip4_header_t * ip0; + u8 do_handoff; + + bi0 = from[0]; + from += 1; + n_left_from -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX]; + rx_fib_index0 = ip4_fib_table_get_index_for_sw_if_index(sw_if_index0); + + ip0 = vlib_buffer_get_current (b0); + + next_worker_index = sm->worker_out2in_cb(ip0, rx_fib_index0); + + if (PREDICT_FALSE (next_worker_index != thread_index)) + { + 
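+	  /* Added note: the buffer hashes to another worker, so it is
+	   * batched into that worker's frame-queue element; an element is
+	   * flushed as soon as it holds VLIB_FRAME_SIZE buffer indices. */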
do_handoff = 1; + + if (next_worker_index != current_worker_index) + { + if (hf) + hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker; + + hf = vlib_get_worker_handoff_queue_elt (sm->fq_out2in_index, + next_worker_index, + handoff_queue_elt_by_worker_index); + + n_left_to_next_worker = VLIB_FRAME_SIZE - hf->n_vectors; + to_next_worker = &hf->buffer_index[hf->n_vectors]; + current_worker_index = next_worker_index; + } + + /* enqueue to correct worker thread */ + to_next_worker[0] = bi0; + to_next_worker++; + n_left_to_next_worker--; + + if (n_left_to_next_worker == 0) + { + hf->n_vectors = VLIB_FRAME_SIZE; + vlib_put_frame_queue_elt (hf); + current_worker_index = ~0; + handoff_queue_elt_by_worker_index[next_worker_index] = 0; + hf = 0; + } + } + else + { + do_handoff = 0; + /* if this is 1st frame */ + if (!f) + { + f = vlib_get_frame_to_node (vm, sm->out2in_node_index); + to_next = vlib_frame_vector_args (f); + } + + to_next[0] = bi0; + to_next += 1; + f->n_vectors++; + } + + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + snat_out2in_worker_handoff_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->next_worker_index = next_worker_index; + t->do_handoff = do_handoff; + } + } + + if (f) + vlib_put_frame_to_node (vm, sm->out2in_node_index, f); + + if (hf) + hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker; + + /* Ship frames to the worker nodes */ + for (i = 0; i < vec_len (handoff_queue_elt_by_worker_index); i++) + { + if (handoff_queue_elt_by_worker_index[i]) + { + hf = handoff_queue_elt_by_worker_index[i]; + /* + * It works better to let the handoff node + * rate-adapt, always ship the handoff queue element. + */ + if (1 || hf->n_vectors == hf->last_n_vectors) + { + vlib_put_frame_queue_elt (hf); + handoff_queue_elt_by_worker_index[i] = 0; + } + else + hf->last_n_vectors = hf->n_vectors; + } + congested_handoff_queue_by_worker_index[i] = + (vlib_frame_queue_t *) (~0); + } + hf = 0; + current_worker_index = ~0; + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (snat_out2in_worker_handoff_node) = { + .function = snat_out2in_worker_handoff_fn, + .name = "nat44-out2in-worker-handoff", + .vector_size = sizeof (u32), + .format_trace = format_snat_out2in_worker_handoff_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_next_nodes = 1, + + .next_nodes = { + [0] = "error-drop", + }, +}; + +VLIB_NODE_FUNCTION_MULTIARCH (snat_out2in_worker_handoff_node, snat_out2in_worker_handoff_fn); + +static uword +snat_out2in_fast_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 n_left_from, * from, * to_next; + snat_out2in_next_t next_index; + u32 pkts_processed = 0; + snat_main_t * sm = &snat_main; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t * b0; + u32 next0 = SNAT_OUT2IN_NEXT_DROP; + u32 sw_if_index0; + ip4_header_t * ip0; + ip_csum_t sum0; + u32 new_addr0, old_addr0; + u16 new_port0, old_port0; + udp_header_t * udp0; + tcp_header_t * tcp0; + icmp46_header_t * icmp0; + snat_session_key_t key0, sm0; + u32 proto0; + u32 rx_fib_index0; + + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = 
vlib_get_buffer (vm, bi0); + + ip0 = vlib_buffer_get_current (b0); + udp0 = ip4_next_header (ip0); + tcp0 = (tcp_header_t *) udp0; + icmp0 = (icmp46_header_t *) udp0; + + sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX]; + rx_fib_index0 = ip4_fib_table_get_index_for_sw_if_index(sw_if_index0); + + vnet_feature_next (sw_if_index0, &next0, b0); + + if (PREDICT_FALSE(ip0->ttl == 1)) + { + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + icmp4_error_set_vnet_buffer (b0, ICMP4_time_exceeded, + ICMP4_time_exceeded_ttl_exceeded_in_transit, + 0); + next0 = SNAT_OUT2IN_NEXT_ICMP_ERROR; + goto trace00; + } + + proto0 = ip_proto_to_snat_proto (ip0->protocol); + + if (PREDICT_FALSE (proto0 == ~0)) + goto trace00; + + if (PREDICT_FALSE (proto0 == SNAT_PROTOCOL_ICMP)) + { + next0 = icmp_out2in(sm, b0, ip0, icmp0, sw_if_index0, + rx_fib_index0, node, next0, ~0, 0, 0); + goto trace00; + } + + key0.addr = ip0->dst_address; + key0.port = udp0->dst_port; + key0.fib_index = rx_fib_index0; + + if (snat_static_mapping_match(sm, key0, &sm0, 1, 0)) + { + b0->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; + goto trace00; + } + + new_addr0 = sm0.addr.as_u32; + new_port0 = sm0.port; + vnet_buffer(b0)->sw_if_index[VLIB_TX] = sm0.fib_index; + old_addr0 = ip0->dst_address.as_u32; + ip0->dst_address.as_u32 = new_addr0; + + sum0 = ip0->checksum; + sum0 = ip_csum_update (sum0, old_addr0, new_addr0, + ip4_header_t, + dst_address /* changed member */); + ip0->checksum = ip_csum_fold (sum0); + + if (PREDICT_FALSE(new_port0 != udp0->dst_port)) + { + if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) + { + old_port0 = tcp0->dst_port; + tcp0->dst_port = new_port0; + + sum0 = tcp0->checksum; + sum0 = ip_csum_update (sum0, old_addr0, new_addr0, + ip4_header_t, + dst_address /* changed member */); + + sum0 = ip_csum_update (sum0, old_port0, new_port0, + ip4_header_t /* cheat */, + length /* changed member */); + tcp0->checksum = ip_csum_fold(sum0); + } + else + { + old_port0 = udp0->dst_port; + udp0->dst_port = new_port0; + udp0->checksum = 0; + } + } + else + { + if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) + { + sum0 = tcp0->checksum; + sum0 = ip_csum_update (sum0, old_addr0, new_addr0, + ip4_header_t, + dst_address /* changed member */); + + tcp0->checksum = ip_csum_fold(sum0); + } + } + + trace00: + + if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + snat_out2in_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->sw_if_index = sw_if_index0; + t->next_index = next0; + } + + pkts_processed += next0 != SNAT_OUT2IN_NEXT_DROP; + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, snat_out2in_fast_node.index, + SNAT_OUT2IN_ERROR_OUT2IN_PACKETS, + pkts_processed); + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (snat_out2in_fast_node) = { + .function = snat_out2in_fast_node_fn, + .name = "nat44-out2in-fast", + .vector_size = sizeof (u32), + .format_trace = format_snat_out2in_fast_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(snat_out2in_error_strings), + .error_strings = snat_out2in_error_strings, + + .runtime_data_bytes = sizeof (snat_runtime_t), + + .n_next_nodes = SNAT_OUT2IN_N_NEXT, + + /* edit / add dispositions here */ + .next_nodes = { + [SNAT_OUT2IN_NEXT_LOOKUP] = "ip4-lookup", + 
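+    /* Added note: at runtime the fast path normally takes the feature-arc
+     * next set by vnet_feature_next(); DROP and ICMP_ERROR below are the
+     * exception dispositions. */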
[SNAT_OUT2IN_NEXT_DROP] = "error-drop",
+    [SNAT_OUT2IN_NEXT_ICMP_ERROR] = "ip4-icmp-error",
+  },
+};
+VLIB_NODE_FUNCTION_MULTIARCH (snat_out2in_fast_node, snat_out2in_fast_node_fn);
diff --git a/src/plugins/pppoe.am b/src/plugins/pppoe.am
new file mode 100644
index 00000000..06ed60b4
--- /dev/null
+++ b/src/plugins/pppoe.am
@@ -0,0 +1,39 @@
+# Copyright (c) 2017 Intel and/or its affiliates.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+vppapitestplugins_LTLIBRARIES += pppoe_test_plugin.la
+vppplugins_LTLIBRARIES += pppoe_plugin.la
+
+pppoe_plugin_la_SOURCES = \
+	pppoe/pppoe_decap.c \
+	pppoe/pppoe_tap.c \
+	pppoe/pppoe_tap_node.c \
+	pppoe/pppoe.c \
+	pppoe/pppoe_api.c
+
+BUILT_SOURCES += \
+	pppoe/pppoe.api.h \
+	pppoe/pppoe.api.json
+
+API_FILES += pppoe/pppoe.api
+
+nobase_apiinclude_HEADERS += \
+	pppoe/pppoe_all_api_h.h \
+	pppoe/pppoe_msg_enum.h \
+	pppoe/pppoe.api.h
+
+pppoe_test_plugin_la_SOURCES = \
+	pppoe/pppoe_test.c \
+	pppoe/pppoe_plugin.api.h
+
+# vi:syntax=automake
diff --git a/src/plugins/pppoe/pppoe.api b/src/plugins/pppoe/pppoe.api
new file mode 100644
index 00000000..e8cd989f
--- /dev/null
+++ b/src/plugins/pppoe/pppoe.api
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2017 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** \brief Set or delete a PPPoE session
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param is_add - add session if non-zero, else delete
+    @param is_ipv6 - client_ip and dst_address are IPv6 or not
+    @param session_id - PPPoE session ID
+    @param client_ip - PPPoE session's client address.
+    @param decap_vrf_id - the vrf id for pppoe decapsulated packets
+    @param client_mac - the client ethernet address
+*/
+define pppoe_add_del_session
+{
+  u32 client_index;
+  u32 context;
+  u8 is_add;
+  u8 is_ipv6;
+  u16 session_id;
+  u8 client_ip[16];
+  u32 decap_vrf_id;
+  u8 client_mac[6];
+};
+
+/** \brief Reply for setting or deleting a PPPoE session
+    @param context - sender context, to match reply w/ request
+    @param retval - return code
+    @param sw_if_index - software index of the interface
+*/
+define pppoe_add_del_session_reply
+{
+  u32 context;
+  i32 retval;
+  u32 sw_if_index;
+};
+
+/** \brief Dump PPPoE sessions
+    @param client_index - opaque cookie to identify the sender
+    @param context - sender context, to match reply w/ request
+    @param sw_if_index - software index of the interface
+*/
+define pppoe_session_dump
+{
+  u32 client_index;
+  u32 context;
+  u32 sw_if_index;
+};
+
+/** \brief Dump details of a PPPoE session
+    @param context - sender context, to match reply w/ request
+    @param sw_if_index - software index of the interface
+    @param is_ipv6 - client_ip and dst_address are IPv6 or not
+    @param session_id - PPPoE session ID
+    @param client_ip - PPPoE session's client address.
+    @param encap_if_index - the index of the tx interface for pppoe encapsulated packets
+    @param decap_vrf_id - the vrf id for pppoe decapsulated packets
+    @param local_mac - the local ethernet address
+    @param client_mac - the client ethernet address
+*/
+define pppoe_session_details
+{
+  u32 context;
+  u32 sw_if_index;
+  u8 is_ipv6;
+  u16 session_id;
+  u8 client_ip[16];
+  u32 encap_if_index;
+  u32 decap_vrf_id;
+  u8 local_mac[6];
+  u8 client_mac[6];
+};
+
+/*
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/plugins/pppoe/pppoe.c b/src/plugins/pppoe/pppoe.c
new file mode 100644
index 00000000..e09ac7d9
--- /dev/null
+++ b/src/plugins/pppoe/pppoe.c
@@ -0,0 +1,739 @@
+/*
+ *------------------------------------------------------------------
+ * Copyright (c) 2017 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------ + */ +#include <stdint.h> +#include <net/if.h> +#include <sys/ioctl.h> +#include <inttypes.h> + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <vnet/ethernet/ethernet.h> +#include <vnet/fib/fib_entry.h> +#include <vnet/fib/fib_table.h> +#include <vnet/dpo/interface_tx_dpo.h> +#include <vnet/plugin/plugin.h> +#include <vpp/app/version.h> +#include <vnet/ppp/packet.h> +#include <pppoe/pppoe.h> +#include <vnet/adj/adj_midchain.h> +#include <vnet/adj/adj_mcast.h> + +#include <vppinfra/hash.h> +#include <vppinfra/bihash_template.c> + +pppoe_main_t pppoe_main; + +u8 * +format_pppoe_session (u8 * s, va_list * args) +{ + pppoe_session_t *t = va_arg (*args, pppoe_session_t *); + pppoe_main_t *pem = &pppoe_main; + + s = format (s, "[%d] sw-if-index %d client-ip %U session-id %d ", + t - pem->sessions, t->sw_if_index, + format_ip46_address, &t->client_ip, IP46_TYPE_ANY, + t->session_id); + + s = format (s, "encap-if-index %d decap-fib-index %d\n", + t->encap_if_index, t->decap_fib_index); + + s = format (s, " local-mac %U client-mac %U", + format_ethernet_address, t->local_mac, + format_ethernet_address, t->client_mac); + + return s; +} + +static u8 * +format_pppoe_name (u8 * s, va_list * args) +{ + u32 dev_instance = va_arg (*args, u32); + return format (s, "pppoe_session%d", dev_instance); +} + +static uword +dummy_interface_tx (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + clib_warning ("you shouldn't be here, leaking buffers..."); + return frame->n_vectors; +} + +static clib_error_t * +pppoe_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags) +{ + u32 hw_flags = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) ? + VNET_HW_INTERFACE_FLAG_LINK_UP : 0; + vnet_hw_interface_set_flags (vnm, hw_if_index, hw_flags); + + return /* no error */ 0; +} + +/* *INDENT-OFF* */ +VNET_DEVICE_CLASS (pppoe_device_class,static) = { + .name = "PPPPOE", + .format_device_name = format_pppoe_name, + .tx_function = dummy_interface_tx, + .admin_up_down_function = pppoe_interface_admin_up_down, +}; +/* *INDENT-ON* */ + +static u8 * +format_pppoe_header_with_length (u8 * s, va_list * args) +{ + u32 dev_instance = va_arg (*args, u32); + s = format (s, "unimplemented dev %u", dev_instance); + return s; +} + +static u8 * +pppoe_build_rewrite (vnet_main_t * vnm, + u32 sw_if_index, + vnet_link_t link_type, const void *dst_address) +{ + int len = sizeof (pppoe_header_t) + sizeof (ethernet_header_t); + pppoe_main_t *pem = &pppoe_main; + pppoe_session_t *t; + u32 session_id; + u8 *rw = 0; + + session_id = pem->session_index_by_sw_if_index[sw_if_index]; + t = pool_elt_at_index (pem->sessions, session_id); + + vec_validate_aligned (rw, len - 1, CLIB_CACHE_LINE_BYTES); + + ethernet_header_t *eth_hdr = (ethernet_header_t *) rw; + clib_memcpy (eth_hdr->dst_address, t->client_mac, 6); + clib_memcpy (eth_hdr->src_address, t->local_mac, 6); + eth_hdr->type = clib_host_to_net_u16 (ETHERNET_TYPE_PPPOE_SESSION); + + pppoe_header_t *pppoe = (pppoe_header_t *) (eth_hdr + 1); + pppoe->ver_type = PPPOE_VER_TYPE; + pppoe->code = 0; + pppoe->session_id = clib_host_to_net_u16 (t->session_id); + pppoe->length = 0; /* To be filled in at run-time */ + + switch (link_type) + { + case VNET_LINK_IP4: + pppoe->ppp_proto = clib_host_to_net_u16 (PPP_PROTOCOL_ip4); + break; + case VNET_LINK_IP6: + pppoe->ppp_proto = clib_host_to_net_u16 (PPP_PROTOCOL_ip6); + break; + default: + break; + } + + return rw; +} + +/** + * @brief Fixup the 
adj rewrite post encap. Insert the packet's length + */ +static void +pppoe_fixup (vlib_main_t * vm, ip_adjacency_t * adj, vlib_buffer_t * b0) +{ + pppoe_header_t *pppoe0; + + pppoe0 = vlib_buffer_get_current (b0); + + pppoe0->length = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0) + - sizeof (pppoe_header_t) + - sizeof (ethernet_header_t)); +} + +static void +pppoe_update_adj (vnet_main_t * vnm, u32 sw_if_index, adj_index_t ai) +{ + pppoe_main_t *pem = &pppoe_main; + dpo_id_t dpo = DPO_INVALID; + ip_adjacency_t *adj; + pppoe_session_t *t; + u32 session_id; + + ASSERT (ADJ_INDEX_INVALID != ai); + + adj = adj_get (ai); + + switch (adj->lookup_next_index) + { + case IP_LOOKUP_NEXT_ARP: + case IP_LOOKUP_NEXT_GLEAN: + adj_nbr_midchain_update_rewrite (ai, pppoe_fixup, + ADJ_FLAG_NONE, + pppoe_build_rewrite (vnm, + sw_if_index, + adj->ia_link, + NULL)); + break; + case IP_LOOKUP_NEXT_MCAST: + /* + * Construct a partial rewrite from the known ethernet mcast dest MAC + * There's no MAC fixup, so the last 2 parameters are 0 + */ + adj_mcast_midchain_update_rewrite (ai, pppoe_fixup, + ADJ_FLAG_NONE, + pppoe_build_rewrite (vnm, + sw_if_index, + adj->ia_link, + NULL), 0, 0); + break; + + case IP_LOOKUP_NEXT_DROP: + case IP_LOOKUP_NEXT_PUNT: + case IP_LOOKUP_NEXT_LOCAL: + case IP_LOOKUP_NEXT_REWRITE: + case IP_LOOKUP_NEXT_MIDCHAIN: + case IP_LOOKUP_NEXT_MCAST_MIDCHAIN: + case IP_LOOKUP_NEXT_ICMP_ERROR: + case IP_LOOKUP_N_NEXT: + ASSERT (0); + break; + } + + session_id = pem->session_index_by_sw_if_index[sw_if_index]; + t = pool_elt_at_index (pem->sessions, session_id); + interface_tx_dpo_add_or_lock (vnet_link_to_dpo_proto (adj->ia_link), + t->encap_if_index, &dpo); + + adj_nbr_midchain_stack (ai, &dpo); + + dpo_reset (&dpo); +} + +/* *INDENT-OFF* */ +VNET_HW_INTERFACE_CLASS (pppoe_hw_class) = +{ + .name = "PPPPOE", + .format_header = format_pppoe_header_with_length, + .build_rewrite = pppoe_build_rewrite, + .update_adjacency = pppoe_update_adj, + .flags = VNET_HW_INTERFACE_CLASS_FLAG_P2P, +}; +/* *INDENT-ON* */ + +#define foreach_copy_field \ +_(session_id) \ +_(encap_if_index) \ +_(decap_fib_index) \ +_(client_ip) + +static bool +pppoe_decap_next_is_valid (pppoe_main_t * pem, u32 is_ip6, + u32 decap_fib_index) +{ + vlib_main_t *vm = pem->vlib_main; + u32 input_idx = (!is_ip6) ? 
ip4_input_node.index : ip6_input_node.index; + vlib_node_runtime_t *r = vlib_node_get_runtime (vm, input_idx); + + return decap_fib_index < r->n_next_nodes; +} + +int vnet_pppoe_add_del_session + (vnet_pppoe_add_del_session_args_t * a, u32 * sw_if_indexp) +{ + pppoe_main_t *pem = &pppoe_main; + pppoe_session_t *t = 0; + vnet_main_t *vnm = pem->vnet_main; + u32 hw_if_index = ~0; + u32 sw_if_index = ~0; + u32 is_ip6 = a->is_ip6; + pppoe_entry_key_t cached_key; + pppoe_entry_result_t cached_result; + u32 bucket; + pppoe_entry_key_t key; + pppoe_entry_result_t result; + vnet_hw_interface_t *hi; + vnet_sw_interface_t *si; + fib_prefix_t pfx; + + cached_key.raw = ~0; + cached_result.raw = ~0; /* warning be gone */ + memset (&pfx, 0, sizeof (pfx)); + + if (!is_ip6) + { + pfx.fp_addr.ip4.as_u32 = a->client_ip.ip4.as_u32; + pfx.fp_len = 32; + pfx.fp_proto = FIB_PROTOCOL_IP4; + } + else + { + pfx.fp_addr.ip6.as_u64[0] = a->client_ip.ip6.as_u64[0]; + pfx.fp_addr.ip6.as_u64[1] = a->client_ip.ip6.as_u64[1]; + pfx.fp_len = 128; + pfx.fp_proto = FIB_PROTOCOL_IP6; + } + + /* Get encap_if_index and local mac address */ + pppoe_lookup_1 (&pem->session_table, &cached_key, &cached_result, + a->client_mac, clib_host_to_net_u16 (a->session_id), + &key, &bucket, &result); + a->encap_if_index = result.fields.sw_if_index; + + if (a->encap_if_index == ~0) + return VNET_API_ERROR_INVALID_SW_IF_INDEX; + + si = vnet_get_sw_interface (vnm, a->encap_if_index); + hi = vnet_get_hw_interface (vnm, si->hw_if_index); + + + if (a->is_add) + { + /* adding a session: session must not already exist */ + if (result.fields.session_index != ~0) + return VNET_API_ERROR_TUNNEL_EXIST; + + /*if not set explicitly, default to ip4 */ + if (!pppoe_decap_next_is_valid (pem, is_ip6, a->decap_fib_index)) + return VNET_API_ERROR_INVALID_DECAP_NEXT; + + pool_get_aligned (pem->sessions, t, CLIB_CACHE_LINE_BYTES); + memset (t, 0, sizeof (*t)); + + clib_memcpy (t->local_mac, hi->hw_address, 6); + + /* copy from arg structure */ +#define _(x) t->x = a->x; + foreach_copy_field; +#undef _ + + clib_memcpy (t->client_mac, a->client_mac, 6); + + /* update pppoe fib with session_index */ + result.fields.session_index = t - pem->sessions; + pppoe_update_1 (&pem->session_table, + a->client_mac, clib_host_to_net_u16 (a->session_id), + &key, &bucket, &result); + + vnet_hw_interface_t *hi; + if (vec_len (pem->free_pppoe_session_hw_if_indices) > 0) + { + vnet_interface_main_t *im = &vnm->interface_main; + hw_if_index = pem->free_pppoe_session_hw_if_indices + [vec_len (pem->free_pppoe_session_hw_if_indices) - 1]; + _vec_len (pem->free_pppoe_session_hw_if_indices) -= 1; + + hi = vnet_get_hw_interface (vnm, hw_if_index); + hi->dev_instance = t - pem->sessions; + hi->hw_instance = hi->dev_instance; + + /* clear old stats of freed session before reuse */ + sw_if_index = hi->sw_if_index; + vnet_interface_counter_lock (im); + vlib_zero_combined_counter + (&im->combined_sw_if_counters[VNET_INTERFACE_COUNTER_TX], + sw_if_index); + vlib_zero_combined_counter (&im->combined_sw_if_counters + [VNET_INTERFACE_COUNTER_RX], + sw_if_index); + vlib_zero_simple_counter (&im->sw_if_counters + [VNET_INTERFACE_COUNTER_DROP], + sw_if_index); + vnet_interface_counter_unlock (im); + } + else + { + hw_if_index = vnet_register_interface + (vnm, pppoe_device_class.index, t - pem->sessions, + pppoe_hw_class.index, t - pem->sessions); + hi = vnet_get_hw_interface (vnm, hw_if_index); + } + + t->hw_if_index = hw_if_index; + t->sw_if_index = sw_if_index = hi->sw_if_index; + + 
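+      /* Extend the sw_if_index -> session index map (unused entries are
+       * initialized to ~0) so the session can later be found from the
+       * interface index a packet is associated with. */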
vec_validate_init_empty (pem->session_index_by_sw_if_index, sw_if_index, + ~0); + pem->session_index_by_sw_if_index[sw_if_index] = t - pem->sessions; + + vnet_sw_interface_t *si = vnet_get_sw_interface (vnm, sw_if_index); + si->flags &= ~VNET_SW_INTERFACE_FLAG_HIDDEN; + vnet_sw_interface_set_flags (vnm, sw_if_index, + VNET_SW_INTERFACE_FLAG_ADMIN_UP); + + /* add reverse route for client ip */ + fib_table_entry_path_add (a->decap_fib_index, &pfx, + FIB_SOURCE_PLUGIN_HI, FIB_ENTRY_FLAG_NONE, + fib_proto_to_dpo (pfx.fp_proto), + &pfx.fp_addr, sw_if_index, ~0, + 1, NULL, FIB_ROUTE_PATH_FLAG_NONE); + + } + else + { + /* deleting a session: session must exist */ + if (result.fields.session_index == ~0) + return VNET_API_ERROR_NO_SUCH_ENTRY; + + t = pool_elt_at_index (pem->sessions, result.fields.session_index); + sw_if_index = t->sw_if_index; + + vnet_sw_interface_set_flags (vnm, t->sw_if_index, 0 /* down */ ); + vnet_sw_interface_t *si = vnet_get_sw_interface (vnm, t->sw_if_index); + si->flags |= VNET_SW_INTERFACE_FLAG_HIDDEN; + + vec_add1 (pem->free_pppoe_session_hw_if_indices, t->hw_if_index); + + pem->session_index_by_sw_if_index[t->sw_if_index] = ~0; + + /* update pppoe fib with session_inde=~0x */ + result.fields.session_index = ~0; + pppoe_update_1 (&pem->session_table, + a->client_mac, clib_host_to_net_u16 (a->session_id), + &key, &bucket, &result); + + + /* delete reverse route for client ip */ + fib_table_entry_path_remove (a->decap_fib_index, &pfx, + FIB_SOURCE_PLUGIN_HI, + fib_proto_to_dpo (pfx.fp_proto), + &pfx.fp_addr, + sw_if_index, ~0, 1, + FIB_ROUTE_PATH_FLAG_NONE); + + pool_put (pem->sessions, t); + } + + if (sw_if_indexp) + *sw_if_indexp = sw_if_index; + + return 0; +} + +static clib_error_t * +pppoe_add_del_session_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + u16 session_id = 0; + ip46_address_t client_ip; + u8 is_add = 1; + u8 client_ip_set = 0; + u8 ipv4_set = 0; + u8 ipv6_set = 0; + u32 encap_if_index = 0; + u32 decap_fib_index = 0; + u8 client_mac[6] = { 0 }; + u8 client_mac_set = 0; + int rv; + u32 tmp; + vnet_pppoe_add_del_session_args_t _a, *a = &_a; + u32 session_sw_if_index; + clib_error_t *error = NULL; + + /* Cant "universally zero init" (={0}) due to GCC bug 53119 */ + memset (&client_ip, 0, sizeof client_ip); + + /* Get a line of input. 
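+     Of the session arguments parsed below, client-ip and client-mac
+     are mandatory; session-id and decap-vrf-id default to 0.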
*/ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "del")) + { + is_add = 0; + } + else if (unformat (line_input, "session-id %d", &session_id)) + ; + else if (unformat (line_input, "client-ip %U", + unformat_ip4_address, &client_ip.ip4)) + { + client_ip_set = 1; + ipv4_set = 1; + } + else if (unformat (line_input, "client-ip %U", + unformat_ip6_address, &client_ip.ip6)) + { + client_ip_set = 1; + ipv6_set = 1; + } + else if (unformat (line_input, "decap-vrf-id %d", &tmp)) + { + if (ipv6_set) + decap_fib_index = fib_table_find (FIB_PROTOCOL_IP6, tmp); + else + decap_fib_index = fib_table_find (FIB_PROTOCOL_IP4, tmp); + + if (decap_fib_index == ~0) + { + error = + clib_error_return (0, "nonexistent decap fib id %d", tmp); + goto done; + } + } + else + if (unformat + (line_input, "client-mac %U", unformat_ethernet_address, + client_mac)) + client_mac_set = 1; + else + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (client_ip_set == 0) + { + error = + clib_error_return (0, "session client ip address not specified"); + goto done; + } + + if (ipv4_set && ipv6_set) + { + error = clib_error_return (0, "both IPv4 and IPv6 addresses specified"); + goto done; + } + + if (client_mac_set == 0) + { + error = clib_error_return (0, "session client mac not specified"); + goto done; + } + + memset (a, 0, sizeof (*a)); + + a->is_add = is_add; + a->is_ip6 = ipv6_set; + +#define _(x) a->x = x; + foreach_copy_field; +#undef _ + + clib_memcpy (a->client_mac, client_mac, 6); + + rv = vnet_pppoe_add_del_session (a, &session_sw_if_index); + + switch (rv) + { + case 0: + if (is_add) + vlib_cli_output (vm, "%U\n", format_vnet_sw_if_index_name, + vnet_get_main (), session_sw_if_index); + break; + + case VNET_API_ERROR_TUNNEL_EXIST: + error = clib_error_return (0, "session already exists..."); + goto done; + + case VNET_API_ERROR_NO_SUCH_ENTRY: + error = clib_error_return (0, "session does not exist..."); + goto done; + + default: + error = clib_error_return + (0, "vnet_pppoe_add_del_session returned %d", rv); + goto done; + } + +done: + unformat_free (line_input); + + return error; +} + +/*? + * Add or delete a PPPPOE Session. + * + * @cliexpar + * Example of how to create a PPPPOE Session: + * @cliexcmd{create pppoe session client-ip 10.0.3.1 session-id 13 + * client-mac 00:01:02:03:04:05 } + * Example of how to delete a PPPPOE Session: + * @cliexcmd{create pppoe session client-ip 10.0.3.1 session-id 13 + * client-mac 00:01:02:03:04:05 del } + ?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (create_pppoe_session_command, static) = { + .path = "create pppoe session", + .short_help = + "create pppoe session client-ip <client-ip> session-id <nn>" + " client-mac <client-mac> [decap-vrf-id <nn>] [del]", + .function = pppoe_add_del_session_command_fn, +}; +/* *INDENT-ON* */ + +/* *INDENT-OFF* */ +static clib_error_t * +show_pppoe_session_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + pppoe_main_t *pem = &pppoe_main; + pppoe_session_t *t; + + if (pool_elts (pem->sessions) == 0) + vlib_cli_output (vm, "No pppoe sessions configured..."); + + pool_foreach (t, pem->sessions, + ({ + vlib_cli_output (vm, "%U",format_pppoe_session, t); + })); + + return 0; +} +/* *INDENT-ON* */ + +/*? + * Display all the PPPPOE Session entries. 
+ * + * @cliexpar + * Example of how to display the PPPPOE Session entries: + * @cliexstart{show pppoe session} + * [0] client-ip 10.0.3.1 session_id 13 encap-if-index 0 decap-vrf-id 13 sw_if_index 5 + * local-mac a0:b0:c0:d0:e0:f0 client-mac 00:01:02:03:04:05 + * @cliexend + ?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_pppoe_session_command, static) = { + .path = "show pppoe session", + .short_help = "show pppoe session", + .function = show_pppoe_session_command_fn, +}; +/* *INDENT-ON* */ + +/** Display the contents of the PPPoE Fib. */ +static clib_error_t * +show_pppoe_fib_command_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + pppoe_main_t *pem = &pppoe_main; + BVT (clib_bihash) * h = &pem->session_table; + BVT (clib_bihash_bucket) * b; + BVT (clib_bihash_value) * v; + pppoe_entry_key_t key; + pppoe_entry_result_t result; + u32 first_entry = 1; + u64 total_entries = 0; + int i, j, k; + u8 *s = 0; + + for (i = 0; i < h->nbuckets; i++) + { + b = &h->buckets[i]; + if (b->offset == 0) + continue; + v = BV (clib_bihash_get_value) (h, b->offset); + for (j = 0; j < (1 << b->log2_pages); j++) + { + for (k = 0; k < BIHASH_KVP_PER_PAGE; k++) + { + if (v->kvp[k].key == ~0ULL && v->kvp[k].value == ~0ULL) + continue; + + if (first_entry) + { + first_entry = 0; + vlib_cli_output (vm, + "%=19s%=12s%=13s%=14s", + "Mac-Address", "session_id", "sw_if_index", + "session_index"); + } + + key.raw = v->kvp[k].key; + result.raw = v->kvp[k].value; + + + vlib_cli_output (vm, + "%=19U%=12d%=13d%=14d", + format_ethernet_address, key.fields.mac, + clib_net_to_host_u16 (key.fields.session_id), + result.fields.sw_if_index == ~0 + ? -1 : result.fields.sw_if_index, + result.fields.session_index == ~0 + ? -1 : result.fields.session_index); + vec_reset_length (s); + total_entries++; + } + v++; + } + } + + if (total_entries == 0) + vlib_cli_output (vm, "no pppoe fib entries"); + else + vlib_cli_output (vm, "%lld pppoe fib entries", total_entries); + + vec_free (s); + return 0; +} + +/*? + * This command dispays the MAC Address entries of the PPPoE FIB table. + * Output can be filtered to just get the number of MAC Addresses or display + * each MAC Address. 
+ * + * @cliexpar + * Example of how to display the number of MAC Address entries in the PPPoE + * FIB table: + * @cliexstart{show pppoe fib} + * Mac Address session_id Interface sw_if_index session_index + * 52:54:00:53:18:33 1 GigabitEthernet0/8/0 2 0 + * 52:54:00:53:18:55 2 GigabitEthernet0/8/1 3 1 + * @cliexend +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_pppoe_fib_command, static) = { + .path = "show pppoe fib", + .short_help = "show pppoe fib", + .function = show_pppoe_fib_command_fn, +}; +/* *INDENT-ON* */ + +clib_error_t * +pppoe_init (vlib_main_t * vm) +{ + pppoe_main_t *pem = &pppoe_main; + + pem->vnet_main = vnet_get_main (); + pem->vlib_main = vm; + + /* Create the hash table */ + BV (clib_bihash_init) (&pem->session_table, "pppoe session table", + PPPOE_NUM_BUCKETS, PPPOE_MEMORY_SIZE); + + ethernet_register_input_type (vm, ETHERNET_TYPE_PPPOE_SESSION, + pppoe_input_node.index); + + ethernet_register_input_type (vm, ETHERNET_TYPE_PPPOE_DISCOVERY, + pppoe_tap_dispatch_node.index); + + return 0; +} + +VLIB_INIT_FUNCTION (pppoe_init); + +/* *INDENT-OFF* */ +VLIB_PLUGIN_REGISTER () = { + .version = VPP_BUILD_VER, + .description = "PPPoE", +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/pppoe/pppoe.h b/src/plugins/pppoe/pppoe.h new file mode 100644 index 00000000..b06c068f --- /dev/null +++ b/src/plugins/pppoe/pppoe.h @@ -0,0 +1,289 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2017 Intel and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *------------------------------------------------------------------ + */ + +#ifndef _PPPOE_H +#define _PPPOE_H + +#include <vnet/plugin/plugin.h> +#include <vppinfra/lock.h> +#include <vppinfra/error.h> +#include <vppinfra/hash.h> +#include <vnet/vnet.h> +#include <vnet/ip/ip.h> +#include <vnet/ethernet/ethernet.h> +#include <vnet/ip/ip4_packet.h> +#include <vnet/ip/ip6_packet.h> +#include <vnet/dpo/dpo.h> +#include <vnet/adj/adj_types.h> +#include <vnet/fib/fib_table.h> +#include <vlib/vlib.h> +#include <vppinfra/bihash_8_8.h> + + +typedef struct +{ + u8 ver_type; + u8 code; + u16 session_id; + u16 length; + u16 ppp_proto; +} pppoe_header_t; + +#define PPPOE_VER_TYPE 0x11 +#define PPPOE_PADS 0x65 + +typedef struct +{ + /* pppoe session_id in HOST byte order */ + u16 session_id; + + /* session client addresses */ + ip46_address_t client_ip; + + /* the index of tx interface for pppoe encaped packet */ + u32 encap_if_index; + + /** FIB indices - inner IP packet lookup here */ + u32 decap_fib_index; + + u8 local_mac[6]; + u8 client_mac[6]; + + /* vnet intfc index */ + u32 sw_if_index; + u32 hw_if_index; + +} pppoe_session_t; + +#define foreach_pppoe_input_next \ +_(DROP, "error-drop") \ +_(IP4_INPUT, "ip4-input") \ +_(IP6_INPUT, "ip6-input" ) \ +_(CP_INPUT, "pppoe-tap-dispatch" ) \ + +typedef enum +{ +#define _(s,n) PPPOE_INPUT_NEXT_##s, + foreach_pppoe_input_next +#undef _ + PPPOE_INPUT_N_NEXT, +} pppoe_input_next_t; + +typedef enum +{ +#define pppoe_error(n,s) PPPOE_ERROR_##n, +#include <pppoe/pppoe_error.def> +#undef pppoe_error + PPPOE_N_ERROR, +} pppoe_input_error_t; + + +#define MTU 1500 +#define MTU_BUFFERS ((MTU + VLIB_BUFFER_DATA_SIZE - 1) / VLIB_BUFFER_DATA_SIZE) +#define NUM_BUFFERS_TO_ALLOC 32 + +/* + * The size of pppoe session table + */ +#define PPPOE_NUM_BUCKETS (128 * 1024) +#define PPPOE_MEMORY_SIZE (16<<20) + +/* *INDENT-OFF* */ +/* + * The PPPoE key is the mac address and session ID + */ +typedef struct +{ + union + { + struct + { + u16 session_id; + u8 mac[6]; + } fields; + struct + { + u32 w0; + u32 w1; + } words; + u64 raw; + }; +} pppoe_entry_key_t; +/* *INDENT-ON* */ + +/* *INDENT-OFF* */ +/* + * The PPPoE entry results + */ +typedef struct +{ + union + { + struct + { + u32 sw_if_index; + + u32 session_index; + + } fields; + u64 raw; + }; +} pppoe_entry_result_t; +/* *INDENT-ON* */ + +typedef struct +{ + /* For DP: vector of encap session instances, */ + pppoe_session_t *sessions; + + /* For CP: vector of CP path */ + BVT (clib_bihash) session_table; + + /* Free vlib hw_if_indices */ + u32 *free_pppoe_session_hw_if_indices; + + /* Mapping from sw_if_index to session index */ + u32 *session_index_by_sw_if_index; + + /* used for pppoe cp path */ + u32 tap_if_index; + + /* API message ID base */ + u16 msg_id_base; + + /* convenience */ + vlib_main_t *vlib_main; + vnet_main_t *vnet_main; + +} pppoe_main_t; + +extern pppoe_main_t pppoe_main; + +extern vlib_node_registration_t pppoe_input_node; +extern vlib_node_registration_t pppoe_tap_dispatch_node; + +typedef struct +{ + u8 is_add; + u8 is_ip6; + u16 session_id; + ip46_address_t client_ip; + u32 encap_if_index; + u32 decap_fib_index; + u8 local_mac[6]; + u8 client_mac[6]; +} vnet_pppoe_add_del_session_args_t; + +int vnet_pppoe_add_del_session + (vnet_pppoe_add_del_session_args_t * a, u32 * sw_if_indexp); + +typedef struct +{ + u8 is_add; + u32 client_if_index; + u32 tap_if_index; +} vnet_pppoe_add_del_tap_args_t; + +always_inline u64 +pppoe_make_key (u8 * mac_address, u16 session_id) +{ + u64 temp; + + /* + * The mac 
address in memory is A:B:C:D:E:F + * The session_id in register is H:L + */ +#if CLIB_ARCH_IS_LITTLE_ENDIAN + /* + * Create the in-register key as F:E:D:C:B:A:H:L + * In memory the key is L:H:A:B:C:D:E:F + */ + temp = *((u64 *) (mac_address)) << 16; + temp = (temp & ~0xffff) | (u64) (session_id); +#else + /* + * Create the in-register key as H:L:A:B:C:D:E:F + * In memory the key is H:L:A:B:C:D:E:F + */ + temp = *((u64 *) (mac_address)) >> 16; + temp = temp | (((u64) session_id) << 48); +#endif + + return temp; +} + +static_always_inline void +pppoe_lookup_1 (BVT (clib_bihash) * session_table, + pppoe_entry_key_t * cached_key, + pppoe_entry_result_t * cached_result, + u8 * mac0, + u16 session_id0, + pppoe_entry_key_t * key0, + u32 * bucket0, pppoe_entry_result_t * result0) +{ + /* set up key */ + key0->raw = pppoe_make_key (mac0, session_id0); + *bucket0 = ~0; + + if (key0->raw == cached_key->raw) + { + /* Hit in the one-entry cache */ + result0->raw = cached_result->raw; + } + else + { + /* Do a regular session table lookup */ + BVT (clib_bihash_kv) kv; + + kv.key = key0->raw; + kv.value = ~0ULL; + BV (clib_bihash_search_inline) (session_table, &kv); + result0->raw = kv.value; + + /* Update one-entry cache */ + cached_key->raw = key0->raw; + cached_result->raw = result0->raw; + } +} + +static_always_inline void +pppoe_update_1 (BVT (clib_bihash) * session_table, + u8 * mac0, + u16 session_id0, + pppoe_entry_key_t * key0, + u32 * bucket0, pppoe_entry_result_t * result0) +{ + /* set up key */ + key0->raw = pppoe_make_key (mac0, session_id0); + *bucket0 = ~0; + + /* Update the entry */ + BVT (clib_bihash_kv) kv; + kv.key = key0->raw; + kv.value = result0->raw; + BV (clib_bihash_add_del) (session_table, &kv, 1 /* is_add */ ); + +} +#endif /* _PPPOE_H */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/pppoe/pppoe_all_api_h.h b/src/plugins/pppoe/pppoe_all_api_h.h new file mode 100644 index 00000000..393c7680 --- /dev/null +++ b/src/plugins/pppoe/pppoe_all_api_h.h @@ -0,0 +1,18 @@ +/* + * pppoe_all_api_h.h - plug-in api #include file + * + * Copyright (c) 2017 Intel and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* Include the generated file, see BUILT_SOURCES in Makefile.am */ +#include <pppoe/pppoe.api.h> diff --git a/src/plugins/pppoe/pppoe_api.c b/src/plugins/pppoe/pppoe_api.c new file mode 100644 index 00000000..9b758460 --- /dev/null +++ b/src/plugins/pppoe/pppoe_api.c @@ -0,0 +1,224 @@ +/* + *------------------------------------------------------------------ + * pppoe_api.c - pppoe api + * + * Copyright (c) 2017 Intel and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#include <vnet/interface.h> +#include <vnet/api_errno.h> +#include <vnet/feature/feature.h> +#include <vnet/fib/fib_table.h> + +#include <vppinfra/byte_order.h> +#include <vlibmemory/api.h> +#include <vlibsocket/api.h> + +#include <pppoe/pppoe.h> + + +#define vl_msg_id(n,h) n, +typedef enum +{ +#include <pppoe/pppoe.api.h> + /* We'll want to know how many messages IDs we need... */ + VL_MSG_FIRST_AVAILABLE, +} vl_msg_id_t; +#undef vl_msg_id + +/* define message structures */ +#define vl_typedefs +#include <pppoe/pppoe.api.h> +#undef vl_typedefs + +/* define generated endian-swappers */ +#define vl_endianfun +#include <pppoe/pppoe.api.h> +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__) +#define vl_printfun +#include <pppoe/pppoe.api.h> +#undef vl_printfun + +/* Get the API version number */ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <pppoe/pppoe.api.h> +#undef vl_api_version + +#define vl_msg_name_crc_list +#include <pppoe/pppoe.api.h> +#undef vl_msg_name_crc_list + +#define REPLY_MSG_ID_BASE pem->msg_id_base +#include <vlibapi/api_helper_macros.h> + +static void +setup_message_id_table (pppoe_main_t * pem, api_main_t * am) +{ +#define _(id,n,crc) \ + vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id + pem->msg_id_base); + foreach_vl_msg_name_crc_pppoe; +#undef _ +} + +#define foreach_pppoe_plugin_api_msg \ +_(PPPOE_ADD_DEL_SESSION, pppoe_add_del_session) \ +_(PPPOE_SESSION_DUMP, pppoe_session_dump) + +static void vl_api_pppoe_add_del_session_t_handler + (vl_api_pppoe_add_del_session_t * mp) +{ + vl_api_pppoe_add_del_session_reply_t *rmp; + int rv = 0; + u32 decap_fib_index; + ip4_main_t *im = &ip4_main; + pppoe_main_t *pem = &pppoe_main; + + uword *p = hash_get (im->fib_index_by_table_id, ntohl (mp->decap_vrf_id)); + if (!p) + { + rv = VNET_API_ERROR_NO_SUCH_INNER_FIB; + goto out; + } + decap_fib_index = p[0]; + + vnet_pppoe_add_del_session_args_t a = { + .is_add = mp->is_add, + .is_ip6 = mp->is_ipv6, + .decap_fib_index = decap_fib_index, + .session_id = ntohs (mp->session_id), + .client_ip = to_ip46 (mp->is_ipv6, mp->client_ip), + }; + clib_memcpy (a.client_mac, mp->client_mac, 6); + + u32 sw_if_index = ~0; + rv = vnet_pppoe_add_del_session (&a, &sw_if_index); + +out: + /* *INDENT-OFF* */ + REPLY_MACRO2(VL_API_PPPOE_ADD_DEL_SESSION_REPLY, + ({ + rmp->sw_if_index = ntohl (sw_if_index); + })); + /* *INDENT-ON* */ +} + +static void send_pppoe_session_details + (pppoe_session_t * t, unix_shared_memory_queue_t * q, u32 context) +{ + vl_api_pppoe_session_details_t *rmp; + ip4_main_t *im4 = &ip4_main; + ip6_main_t *im6 = &ip6_main; + u8 is_ipv6 = !ip46_address_is_ip4 (&t->client_ip); + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_PPPOE_SESSION_DETAILS); + if (is_ipv6) + { + memcpy (rmp->client_ip, t->client_ip.ip6.as_u8, 16); + rmp->decap_vrf_id = htonl (im6->fibs[t->decap_fib_index].ft_table_id); + } + else + { + 
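+      /* IPv4 client: copy only the 4 significant bytes; the rest of the
+       * 16-byte client_ip wire field stays zero from the memset above. */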
memcpy (rmp->client_ip, t->client_ip.ip4.as_u8, 4); + rmp->decap_vrf_id = htonl (im4->fibs[t->decap_fib_index].ft_table_id); + } + rmp->session_id = htons (t->session_id); + rmp->encap_if_index = htonl (t->encap_if_index); + clib_memcpy (rmp->local_mac, t->local_mac, 6); + clib_memcpy (rmp->client_mac, t->client_mac, 6); + rmp->sw_if_index = htonl (t->sw_if_index); + rmp->is_ipv6 = is_ipv6; + rmp->context = context; + + vl_msg_api_send_shmem (q, (u8 *) & rmp); +} + +static void +vl_api_pppoe_session_dump_t_handler (vl_api_pppoe_session_dump_t * mp) +{ + unix_shared_memory_queue_t *q; + pppoe_main_t *pem = &pppoe_main; + pppoe_session_t *t; + u32 sw_if_index; + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (q == 0) + { + return; + } + + sw_if_index = ntohl (mp->sw_if_index); + + if (~0 == sw_if_index) + { + /* *INDENT-OFF* */ + pool_foreach (t, pem->sessions, + ({ + send_pppoe_session_details(t, q, mp->context); + })); + /* *INDENT-ON* */ + } + else + { + if ((sw_if_index >= vec_len (pem->session_index_by_sw_if_index)) || + (~0 == pem->session_index_by_sw_if_index[sw_if_index])) + { + return; + } + t = &pem->sessions[pem->session_index_by_sw_if_index[sw_if_index]]; + send_pppoe_session_details (t, q, mp->context); + } +} + + +static clib_error_t * +pppoe_api_hookup (vlib_main_t * vm) +{ + pppoe_main_t *pem = &pppoe_main; + + u8 *name = format (0, "pppoe_%08x%c", api_version, 0); + pem->msg_id_base = vl_msg_api_get_msg_ids + ((char *) name, VL_MSG_FIRST_AVAILABLE); + +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + pem->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_pppoe_plugin_api_msg; +#undef _ + + /* Add our API messages to the global name_crc hash table */ + setup_message_id_table (pem, &api_main); + + return 0; +} + +VLIB_API_INIT_FUNCTION (pppoe_api_hookup); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/pppoe/pppoe_decap.c b/src/plugins/pppoe/pppoe_decap.c new file mode 100644 index 00000000..02c82711 --- /dev/null +++ b/src/plugins/pppoe/pppoe_decap.c @@ -0,0 +1,422 @@ +/* + * decap.c: pppoe session decap packet processing + * + * Copyright (c) 2017 Intel and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <vlib/vlib.h> +#include <vnet/pg/pg.h> +#include <vnet/ppp/packet.h> +#include <pppoe/pppoe.h> + +typedef struct { + u32 next_index; + u32 session_index; + u32 session_id; + u32 error; +} pppoe_rx_trace_t; + +static u8 * format_pppoe_rx_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + pppoe_rx_trace_t * t = va_arg (*args, pppoe_rx_trace_t *); + + if (t->session_index != ~0) + { + s = format (s, "PPPoE decap from pppoe_session%d session_id %d next %d error %d", + t->session_index, t->session_id, t->next_index, t->error); + } + else + { + s = format (s, "PPPoE decap error - session for session_id %d does not exist", + t->session_id); + } + return s; +} + +static uword +pppoe_input (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + u32 n_left_from, next_index, * from, * to_next; + pppoe_main_t * pem = &pppoe_main; + vnet_main_t * vnm = pem->vnet_main; + vnet_interface_main_t * im = &vnm->interface_main; + u32 pkts_decapsulated = 0; + u32 thread_index = vlib_get_thread_index(); + u32 stats_sw_if_index, stats_n_packets, stats_n_bytes; + pppoe_entry_key_t cached_key; + pppoe_entry_result_t cached_result; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + /* Clear the one-entry cache in case session table was updated */ + cached_key.raw = ~0; + cached_result.raw = ~0; /* warning be gone */ + + next_index = node->cached_next_index; + stats_sw_if_index = node->runtime_data[0]; + stats_n_packets = stats_n_bytes = 0; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + while (n_left_from >= 4 && n_left_to_next >= 2) + { + u32 bi0, bi1; + vlib_buffer_t * b0, * b1; + u32 next0, next1; + ethernet_header_t *h0, *h1; + pppoe_header_t * pppoe0, * pppoe1; + u16 ppp_proto0 = 0, ppp_proto1 = 0; + pppoe_session_t * t0, * t1; + u32 error0, error1; + u32 sw_if_index0, sw_if_index1, len0, len1; + pppoe_entry_key_t key0, key1; + pppoe_entry_result_t result0, result1; + u32 bucket0, bucket1; + + /* Prefetch next iteration. 
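+	     Buffers two ahead (from[2] and from[3]) are prefetched so
+	     their headers and first data cache lines are warm by the
+	     time the dual loop reaches them on the next pass.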
*/ + { + vlib_buffer_t * p2, * p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, 2*CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (p3->data, 2*CLIB_CACHE_LINE_BYTES, LOAD); + } + + bi0 = from[0]; + bi1 = from[1]; + to_next[0] = bi0; + to_next[1] = bi1; + from += 2; + to_next += 2; + n_left_to_next -= 2; + n_left_from -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + error0 = 0; + error1 = 0; + + /* leaves current_data pointing at the pppoe header */ + pppoe0 = vlib_buffer_get_current (b0); + pppoe1 = vlib_buffer_get_current (b1); + ppp_proto0 = clib_net_to_host_u16(pppoe0->ppp_proto); + ppp_proto1 = clib_net_to_host_u16(pppoe1->ppp_proto); + + /* Manipulate packet 0 */ + if ((ppp_proto0 != PPP_PROTOCOL_ip4) + && (ppp_proto0 != PPP_PROTOCOL_ip6)) + { + error0 = PPPOE_ERROR_CONTROL_PLANE; + next0 = PPPOE_INPUT_NEXT_CP_INPUT; + goto trace0; + } + + /* get client mac */ + vlib_buffer_reset(b0); + h0 = vlib_buffer_get_current (b0); + + pppoe_lookup_1 (&pem->session_table, &cached_key, &cached_result, + h0->src_address, pppoe0->session_id, + &key0, &bucket0, &result0); + if (PREDICT_FALSE (result0.fields.session_index == ~0)) + { + error0 = PPPOE_ERROR_NO_SUCH_SESSION; + next0 = PPPOE_INPUT_NEXT_DROP; + goto trace0; + } + + t0 = pool_elt_at_index (pem->sessions, + result0.fields.session_index); + + /* Pop Eth and PPPPoE header */ + vlib_buffer_advance(b0, sizeof(*h0)+sizeof(*pppoe0)); + + next0 = (ppp_proto0==PPP_PROTOCOL_ip4)? + PPPOE_INPUT_NEXT_IP4_INPUT + : PPPOE_INPUT_NEXT_IP6_INPUT; + + sw_if_index0 = t0->sw_if_index; + len0 = vlib_buffer_length_in_chain (vm, b0); + + pkts_decapsulated ++; + stats_n_packets += 1; + stats_n_bytes += len0; + + /* Batch stats increment on the same pppoe session so counter + is not incremented per packet */ + if (PREDICT_FALSE (sw_if_index0 != stats_sw_if_index)) + { + stats_n_packets -= 1; + stats_n_bytes -= len0; + if (stats_n_packets) + vlib_increment_combined_counter + (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, + thread_index, stats_sw_if_index, + stats_n_packets, stats_n_bytes); + stats_n_packets = 1; + stats_n_bytes = len0; + stats_sw_if_index = sw_if_index0; + } + + trace0: + b0->error = error0 ? node->errors[error0] : 0; + + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + pppoe_rx_trace_t *tr + = vlib_add_trace (vm, node, b0, sizeof (*tr)); + tr->next_index = next0; + tr->error = error0; + tr->session_index = result0.fields.session_index; + tr->session_id = clib_net_to_host_u32(pppoe0->session_id); + } + + + /* Manipulate packet 1 */ + if ((ppp_proto1 != PPP_PROTOCOL_ip4) + && (ppp_proto1 != PPP_PROTOCOL_ip6)) + { + error1 = PPPOE_ERROR_CONTROL_PLANE; + next1 = PPPOE_INPUT_NEXT_CP_INPUT; + goto trace1; + } + + /* get client mac */ + vlib_buffer_reset(b1); + h1 = vlib_buffer_get_current (b1); + + pppoe_lookup_1 (&pem->session_table, &cached_key, &cached_result, + h1->src_address, pppoe1->session_id, + &key1, &bucket1, &result1); + if (PREDICT_FALSE (result1.fields.session_index == ~0)) + { + error1 = PPPOE_ERROR_NO_SUCH_SESSION; + next1 = PPPOE_INPUT_NEXT_DROP; + goto trace1; + } + + t1 = pool_elt_at_index (pem->sessions, + result1.fields.session_index); + + /* Pop Eth and PPPPoE header */ + vlib_buffer_advance(b1, sizeof(*h1)+sizeof(*pppoe1)); + + next1 = (ppp_proto1==PPP_PROTOCOL_ip4)? 
+ PPPOE_INPUT_NEXT_IP4_INPUT + : PPPOE_INPUT_NEXT_IP6_INPUT; + + sw_if_index1 = t1->sw_if_index; + len1 = vlib_buffer_length_in_chain (vm, b1); + + pkts_decapsulated ++; + stats_n_packets += 1; + stats_n_bytes += len1; + + /* Batch stats increment on the same pppoe session so counter + is not incremented per packet */ + if (PREDICT_FALSE (sw_if_index1 != stats_sw_if_index)) + { + stats_n_packets -= 1; + stats_n_bytes -= len1; + if (stats_n_packets) + vlib_increment_combined_counter + (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, + thread_index, stats_sw_if_index, + stats_n_packets, stats_n_bytes); + stats_n_packets = 1; + stats_n_bytes = len1; + stats_sw_if_index = sw_if_index1; + } + + trace1: + b1->error = error1 ? node->errors[error1] : 0; + + if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED)) + { + pppoe_rx_trace_t *tr + = vlib_add_trace (vm, node, b1, sizeof (*tr)); + tr->next_index = next1; + tr->error = error1; + tr->session_index = result1.fields.session_index; + tr->session_id = clib_net_to_host_u32(pppoe1->session_id); + } + + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, next0, next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t * b0; + u32 next0; + ethernet_header_t *h0; + pppoe_header_t * pppoe0; + u16 ppp_proto0 = 0; + pppoe_session_t * t0; + u32 error0; + u32 sw_if_index0, len0; + pppoe_entry_key_t key0; + pppoe_entry_result_t result0; + u32 bucket0; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + error0 = 0; + + /* leaves current_data pointing at the pppoe header */ + pppoe0 = vlib_buffer_get_current (b0); + ppp_proto0 = clib_net_to_host_u16(pppoe0->ppp_proto); + + if ((ppp_proto0 != PPP_PROTOCOL_ip4) + && (ppp_proto0 != PPP_PROTOCOL_ip6)) + { + error0 = PPPOE_ERROR_CONTROL_PLANE; + next0 = PPPOE_INPUT_NEXT_CP_INPUT; + goto trace00; + } + + /* get client mac */ + vlib_buffer_reset(b0); + h0 = vlib_buffer_get_current (b0); + + pppoe_lookup_1 (&pem->session_table, &cached_key, &cached_result, + h0->src_address, pppoe0->session_id, + &key0, &bucket0, &result0); + if (PREDICT_FALSE (result0.fields.session_index == ~0)) + { + error0 = PPPOE_ERROR_NO_SUCH_SESSION; + next0 = PPPOE_INPUT_NEXT_DROP; + goto trace00; + } + + t0 = pool_elt_at_index (pem->sessions, + result0.fields.session_index); + + /* Pop Eth and PPPPoE header */ + vlib_buffer_advance(b0, sizeof(*h0)+sizeof(*pppoe0)); + + next0 = (ppp_proto0==PPP_PROTOCOL_ip4)? + PPPOE_INPUT_NEXT_IP4_INPUT + : PPPOE_INPUT_NEXT_IP6_INPUT; + + sw_if_index0 = t0->sw_if_index; + len0 = vlib_buffer_length_in_chain (vm, b0); + + pkts_decapsulated ++; + stats_n_packets += 1; + stats_n_bytes += len0; + + /* Batch stats increment on the same pppoe session so counter + is not incremented per packet */ + if (PREDICT_FALSE (sw_if_index0 != stats_sw_if_index)) + { + stats_n_packets -= 1; + stats_n_bytes -= len0; + if (stats_n_packets) + vlib_increment_combined_counter + (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, + thread_index, stats_sw_if_index, + stats_n_packets, stats_n_bytes); + stats_n_packets = 1; + stats_n_bytes = len0; + stats_sw_if_index = sw_if_index0; + } + + trace00: + b0->error = error0 ? 
node->errors[error0] : 0; + + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + pppoe_rx_trace_t *tr + = vlib_add_trace (vm, node, b0, sizeof (*tr)); + tr->next_index = next0; + tr->error = error0; + tr->session_index = result0.fields.session_index; + tr->session_id = clib_net_to_host_u16(pppoe0->session_id); + } + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + /* Do we still need this now that session tx stats is kept? */ + vlib_node_increment_counter (vm, pppoe_input_node.index, + PPPOE_ERROR_DECAPSULATED, + pkts_decapsulated); + + /* Increment any remaining batch stats */ + if (stats_n_packets) + { + vlib_increment_combined_counter + (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + node->runtime_data[0] = stats_sw_if_index; + } + + return from_frame->n_vectors; +} + +static char * pppoe_error_strings[] = { +#define pppoe_error(n,s) s, +#include <pppoe/pppoe_error.def> +#undef pppoe_error +#undef _ +}; + +VLIB_REGISTER_NODE (pppoe_input_node) = { + .function = pppoe_input, + .name = "pppoe-input", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + + .n_errors = PPPOE_N_ERROR, + .error_strings = pppoe_error_strings, + + .n_next_nodes = PPPOE_INPUT_N_NEXT, + .next_nodes = { +#define _(s,n) [PPPOE_INPUT_NEXT_##s] = n, + foreach_pppoe_input_next +#undef _ + }, + + .format_trace = format_pppoe_rx_trace, +}; + +VLIB_NODE_FUNCTION_MULTIARCH (pppoe_input_node, pppoe_input) + + diff --git a/src/plugins/pppoe/pppoe_error.def b/src/plugins/pppoe/pppoe_error.def new file mode 100644 index 00000000..a875afd0 --- /dev/null +++ b/src/plugins/pppoe/pppoe_error.def @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2017 Intel and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +pppoe_error (DECAPSULATED, "good packets decapsulated") +pppoe_error (CONTROL_PLANE, "control plane packet") +pppoe_error (NO_SUCH_SESSION, "no such sessions") +pppoe_error (BAD_VER_TYPE, "bad version and type in pppoe header") diff --git a/src/plugins/pppoe/pppoe_msg_enum.h b/src/plugins/pppoe/pppoe_msg_enum.h new file mode 100644 index 00000000..7ca19189 --- /dev/null +++ b/src/plugins/pppoe/pppoe_msg_enum.h @@ -0,0 +1,31 @@ +/* + * pppoe_msg_enum.h - vpp engine plug-in message enumeration + * + * Copyright (c) 2017 Intel and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef included_pppoe_msg_enum_h +#define included_pppoe_msg_enum_h + +#include <vppinfra/byte_order.h> + +#define vl_msg_id(n,h) n, +typedef enum +{ +#include <pppoe/pppoe_all_api_h.h> + /* We'll want to know how many messages IDs we need... */ + VL_MSG_FIRST_AVAILABLE, +} vl_msg_id_t; +#undef vl_msg_id + +#endif /* included_pppoe_msg_enum_h */ diff --git a/src/plugins/pppoe/pppoe_tap.c b/src/plugins/pppoe/pppoe_tap.c new file mode 100644 index 00000000..60cdaafb --- /dev/null +++ b/src/plugins/pppoe/pppoe_tap.c @@ -0,0 +1,89 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2017 Intel and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#include <pppoe/pppoe.h> +#include <vnet/unix/tapcli.h> + +static clib_error_t * +pppoe_add_del_tap_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + pppoe_main_t *pem = &pppoe_main; + u8 is_add = 1; + u8 tap_if_index_set = 0; + u32 tap_if_index = 0; + clib_error_t *error = NULL; + + /* Get a line of input. */ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "del")) + { + is_add = 0; + } + else if (unformat (line_input, "tap-if-index %d", &tap_if_index)) + tap_if_index_set = 1; + else + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (tap_if_index_set == 0) + { + error = clib_error_return (0, "tap if index not specified"); + goto done; + } + + if (is_add) + { + pem->tap_if_index = tap_if_index; + } + else + { + pem->tap_if_index = ~0; + } + +done: + unformat_free (line_input); + + return error; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (create_pppoe_tap_cmd, static) = +{ + .path = "create pppoe tap", + .short_help = "create pppoe tap if-name <intfc> [del]", + .function = pppoe_add_del_tap_command_fn, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/pppoe/pppoe_tap_node.c b/src/plugins/pppoe/pppoe_tap_node.c new file mode 100644 index 00000000..f1e0a501 --- /dev/null +++ b/src/plugins/pppoe/pppoe_tap_node.c @@ -0,0 +1,297 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2017 Intel and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/ppp/packet.h>
+#include <pppoe/pppoe.h>
+
+vlib_node_registration_t pppoe_tap_dispatch_node;
+
+#define foreach_pppoe_tap_next        \
+_(DROP, "error-drop")                  \
+_(TUNTAP, "tuntap-tx" )                \
+_(INTERFACE, "interface-output" )      \
+
+typedef enum
+{
+#define _(s,n) PPPOE_TAP_NEXT_##s,
+  foreach_pppoe_tap_next
+#undef _
+  PPPOE_TAP_N_NEXT,
+} pppoe_tap_next_t;
+
+typedef struct {
+  u32 next_index;
+  u32 sw_if_index;
+  u32 tap_if_index;
+  u8 pppoe_code;
+  u16 ppp_proto;
+  u32 error;
+} pppoe_tap_trace_t;
+
+static u8 * format_pppoe_tap_trace (u8 * s, va_list * args)
+{
+  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+  pppoe_tap_trace_t * t = va_arg (*args, pppoe_tap_trace_t *);
+  pppoe_main_t * pem = &pppoe_main;
+
+  if (t->sw_if_index != pem->tap_if_index)
+    {
+      s = format (s, "PPPoE dispatch from sw_if_index %d next %d error %d \n"
+		  "  pppoe_code 0x%x ppp_proto 0x%x",
+		  t->sw_if_index, t->next_index, t->error,
+		  t->pppoe_code, t->ppp_proto);
+    }
+  else
+    {
+      s = format (s, "PPPoE dispatch from tap_if_index %d next %d error %d \n"
+		  "  pppoe_code 0x%x ppp_proto 0x%x",
+		  t->tap_if_index, t->next_index, t->error,
+		  t->pppoe_code, t->ppp_proto);
+    }
+  return s;
+}
+
+/**
+ * Perform learning on one packet based on the mac table lookup result.
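+ * A hit whose sw_if_index already matches needs no update.  On a miss
+ * the client's MAC is added with session_index ~0 and the one-entry
+ * lookup cache is invalidated; on a mismatch (MAC move) the stale
+ * sw_if_index is rewritten in place.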
+ * */ +static_always_inline void +pppoe_learn_process (vlib_node_runtime_t * node, + pppoe_main_t * pem, + vlib_buffer_t * b0, + u32 sw_if_index0, + pppoe_entry_key_t * key0, + pppoe_entry_key_t * cached_key, + u32 * bucket0, + pppoe_entry_result_t * result0) +{ + /* Check mac table lookup result */ + if (PREDICT_TRUE (result0->fields.sw_if_index == sw_if_index0)) + { + /* + * The entry was in the table, and the sw_if_index matched, the normal case + */ + return; + } + else if (result0->fields.sw_if_index == ~0) + { + /* The entry was not in table, so add it */ + result0->fields.sw_if_index = sw_if_index0; + result0->fields.session_index = ~0; + cached_key->raw = ~0; /* invalidate the cache */ + } + else + { + /* The entry was in the table, but with the wrong sw_if_index mapping (mac move) */ + result0->fields.sw_if_index = sw_if_index0; + } + + /* Update the entry */ + BVT (clib_bihash_kv) kv; + kv.key = key0->raw; + kv.value = result0->raw; + BV (clib_bihash_add_del) (&pem->session_table, &kv, 1 /* is_add */ ); +} + +static uword +pppoe_tap_dispatch (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + u32 n_left_from, next_index, * from, * to_next; + pppoe_main_t * pem = &pppoe_main; + vnet_main_t * vnm = pem->vnet_main; + vnet_interface_main_t * im = &vnm->interface_main; + u32 pkts_decapsulated = 0; + u32 thread_index = vlib_get_thread_index(); + u32 stats_sw_if_index, stats_n_packets, stats_n_bytes; + pppoe_entry_key_t cached_key; + pppoe_entry_result_t cached_result; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + /* Clear the one-entry cache in case session table was updated */ + cached_key.raw = ~0; + cached_result.raw = ~0; /* warning be gone */ + + next_index = node->cached_next_index; + stats_sw_if_index = node->runtime_data[0]; + stats_n_packets = stats_n_bytes = 0; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t * b0; + ethernet_header_t *h0; + pppoe_header_t * pppoe0; + pppoe_entry_key_t key0; + pppoe_entry_result_t result0; + + u32 bucket0; + u32 next0; + u32 error0 = 0; + u32 rx_sw_if_index0=~0, tx_sw_if_index0=~0, len0; + vnet_hw_interface_t *hi; + vnet_sw_interface_t *si; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + /* leaves current_data pointing at the pppoe header */ + pppoe0 = vlib_buffer_get_current (b0); + rx_sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX]; + + if (PREDICT_FALSE (pppoe0->ver_type != PPPOE_VER_TYPE)) + { + error0 = PPPOE_ERROR_BAD_VER_TYPE; + next0 = PPPOE_INPUT_NEXT_DROP; + goto trace00; + } + + vlib_buffer_reset(b0); + h0 = vlib_buffer_get_current (b0); + + if(rx_sw_if_index0 == pem->tap_if_index) + { + pppoe_lookup_1 (&pem->session_table, &cached_key, &cached_result, + h0->dst_address, 0, + &key0, &bucket0, &result0); + tx_sw_if_index0 = result0.fields.sw_if_index; + + if (PREDICT_FALSE (tx_sw_if_index0 == ~0)) + { + error0 = PPPOE_ERROR_NO_SUCH_SESSION; + next0 = PPPOE_INPUT_NEXT_DROP; + goto trace00; + } + + next0 = PPPOE_TAP_NEXT_INTERFACE; + vnet_buffer(b0)->sw_if_index[VLIB_TX] = tx_sw_if_index0; + + /* set src mac address */ + si = vnet_get_sw_interface(vnm, tx_sw_if_index0); + hi = vnet_get_hw_interface (vnm, si->hw_if_index); + clib_memcpy (vlib_buffer_get_current (b0)+6, hi->hw_address, 6); + } + 
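+	  /* Frames arriving from a client-facing interface take the
+	     branch below: look up the session, learn the client's
+	     MAC/session binding, and hand the frame to the tap. */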
else + { + pppoe_lookup_1 (&pem->session_table, &cached_key, &cached_result, + h0->src_address, pppoe0->session_id, + &key0, &bucket0, &result0); + tx_sw_if_index0 = result0.fields.sw_if_index; + + /* learn client session */ + pppoe_learn_process (node, pem, b0, rx_sw_if_index0, + &key0, &cached_key, + &bucket0, &result0); + + next0 = PPPOE_TAP_NEXT_TUNTAP; + vnet_buffer(b0)->sw_if_index[VLIB_TX] = pem->tap_if_index; + } + + len0 = vlib_buffer_length_in_chain (vm, b0); + + pkts_decapsulated ++; + stats_n_packets += 1; + stats_n_bytes += len0; + + /* Batch stats increment on the same pppoe session so counter + is not incremented per packet */ + if (PREDICT_FALSE (rx_sw_if_index0 != stats_sw_if_index)) + { + stats_n_packets -= 1; + stats_n_bytes -= len0; + if (stats_n_packets) + vlib_increment_combined_counter + (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, + thread_index, stats_sw_if_index, + stats_n_packets, stats_n_bytes); + stats_n_packets = 1; + stats_n_bytes = len0; + stats_sw_if_index = rx_sw_if_index0; + } + + trace00: + b0->error = error0 ? node->errors[error0] : 0; + + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + pppoe_tap_trace_t *tr + = vlib_add_trace (vm, node, b0, sizeof (*tr)); + tr->next_index = next0; + tr->error = error0; + tr->sw_if_index = tx_sw_if_index0; + tr->tap_if_index = pem->tap_if_index; + tr->pppoe_code = pppoe0->code; + tr->ppp_proto = clib_net_to_host_u16(pppoe0->ppp_proto); + } + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + /* Do we still need this now that session tx stats is kept? */ + vlib_node_increment_counter (vm, pppoe_input_node.index, + PPPOE_ERROR_DECAPSULATED, + pkts_decapsulated); + + /* Increment any remaining batch stats */ + if (stats_n_packets) + { + vlib_increment_combined_counter + (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + node->runtime_data[0] = stats_sw_if_index; + } + + return from_frame->n_vectors; +} + +VLIB_REGISTER_NODE (pppoe_tap_dispatch_node) = { + .function = pppoe_tap_dispatch, + .name = "pppoe-tap-dispatch", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + + .n_next_nodes = PPPOE_TAP_N_NEXT, + .next_nodes = { +#define _(s,n) [PPPOE_TAP_NEXT_##s] = n, + foreach_pppoe_tap_next +#undef _ + }, + + .format_trace = format_pppoe_tap_trace, +}; + +VLIB_NODE_FUNCTION_MULTIARCH (pppoe_tap_dispatch_node, pppoe_tap_dispatch) + diff --git a/src/plugins/pppoe/pppoe_test.c b/src/plugins/pppoe/pppoe_test.c new file mode 100644 index 00000000..2b67d989 --- /dev/null +++ b/src/plugins/pppoe/pppoe_test.c @@ -0,0 +1,330 @@ +/* + * Copyright (c) 2017 Intel and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <vat/vat.h>
+#include <vlibapi/api.h>
+#include <vlibmemory/api.h>
+#include <vlibsocket/api.h>
+#include <vppinfra/error.h>
+#include <pppoe/pppoe.h>
+
+#define __plugin_msg_base pppoe_test_main.msg_id_base
+#include <vlibapi/vat_helper_macros.h>
+
+uword unformat_ip46_address (unformat_input_t * input, va_list * args)
+{
+  ip46_address_t *ip46 = va_arg (*args, ip46_address_t *);
+  ip46_type_t type = va_arg (*args, ip46_type_t);
+  if ((type != IP46_TYPE_IP6) &&
+      unformat(input, "%U", unformat_ip4_address, &ip46->ip4)) {
+    ip46_address_mask_ip4(ip46);
+    return 1;
+  } else if ((type != IP46_TYPE_IP4) &&
+      unformat(input, "%U", unformat_ip6_address, &ip46->ip6)) {
+    return 1;
+  }
+  return 0;
+}
+
+uword unformat_ip46_prefix (unformat_input_t * input, va_list * args)
+{
+  ip46_address_t *ip46 = va_arg (*args, ip46_address_t *);
+  u8 *len = va_arg (*args, u8 *);
+  ip46_type_t type = va_arg (*args, ip46_type_t);
+
+  u32 l;
+  if ((type != IP46_TYPE_IP6) &&
+      unformat(input, "%U/%u", unformat_ip4_address, &ip46->ip4, &l)) {
+    if (l > 32)
+      return 0;
+    *len = l + 96;
+    ip46->pad[0] = ip46->pad[1] = ip46->pad[2] = 0;
+  } else if ((type != IP46_TYPE_IP4) &&
+             unformat(input, "%U/%u", unformat_ip6_address, &ip46->ip6, &l)) {
+    if (l > 128)
+      return 0;
+    *len = l;
+  } else {
+    return 0;
+  }
+  return 1;
+}
+/////////////////////////
+
+#define vl_msg_id(n,h) n,
+typedef enum {
+#include <pppoe/pppoe.api.h>
+  /* We'll want to know how many message IDs we need... */
+  VL_MSG_FIRST_AVAILABLE,
+} vl_msg_id_t;
+#undef vl_msg_id
+
+/* define message structures */
+#define vl_typedefs
+#include <pppoe/pppoe.api.h>
+#undef vl_typedefs
+
+/* define generated endian-swappers */
+#define vl_endianfun
+#include <pppoe/pppoe.api.h>
+#undef vl_endianfun
+
+/* instantiate all the print functions we know about */
+#define vl_print(handle, ...)
+#define vl_printfun
+#include <pppoe/pppoe.api.h>
+#undef vl_printfun
+
+/* Get the API version number.
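+
+   pppoe.api.h is generated from pppoe.api and is included several
+   times above under different vl_* macro definitions (an X-macro
+   pattern): once to build the message-id enum, once for the message
+   struct typedefs, once for the endian converters, and once for the
+   print functions. The include just below evaluates only
+   vl_api_version, capturing the version stamp generated for the .api
+   file; vat_plugin_register() later formats it into the name used to
+   look up this plugin's message-id base.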
*/ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <pppoe/pppoe.api.h> +#undef vl_api_version + +typedef struct { + /* API message ID base */ + u16 msg_id_base; + vat_main_t *vat_main; +} pppoe_test_main_t; + +pppoe_test_main_t pppoe_test_main; + +static void vl_api_pppoe_add_del_session_reply_t_handler + (vl_api_pppoe_add_del_session_reply_t * mp) +{ + vat_main_t *vam = &vat_main; + i32 retval = ntohl (mp->retval); + if (vam->async_mode) + { + vam->async_errors += (retval < 0); + } + else + { + vam->retval = retval; + vam->sw_if_index = ntohl (mp->sw_if_index); + vam->result_ready = 1; + } +} + + +/* + * Table of message reply handlers, must include boilerplate handlers + * we just generated + */ +#define foreach_vpe_api_reply_msg \ + _(PPPOE_ADD_DEL_SESSION_REPLY, pppoe_add_del_session_reply) \ + _(PPPOE_SESSION_DETAILS, pppoe_session_details) + + +static int +api_pppoe_add_del_session (vat_main_t * vam) +{ + unformat_input_t *line_input = vam->input; + vl_api_pppoe_add_del_session_t *mp; + u16 session_id = 0; + ip46_address_t client_ip; + u8 is_add = 1; + u8 client_ip_set = 0; + u8 ipv4_set = 0; + u8 ipv6_set = 0; + u32 decap_vrf_id = 0; + u8 client_mac[6] = { 0 }; + u8 client_mac_set = 0; + int ret; + + /* Can't "universally zero init" (={0}) due to GCC bug 53119 */ + memset (&client_ip, 0, sizeof client_ip); + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "del")) + { + is_add = 0; + } + else if (unformat (line_input, "session_id %d", &session_id)) + ; + else if (unformat (line_input, "client-ip %U", + unformat_ip4_address, &client_ip.ip4)) + { + client_ip_set = 1; + ipv4_set = 1; + } + else if (unformat (line_input, "client-ip %U", + unformat_ip6_address, &client_ip.ip6)) + { + client_ip_set = 1; + ipv6_set = 1; + } + else if (unformat (line_input, "decap-vrf-id %d", &decap_vrf_id)) + ; + else if (unformat (line_input, "client-mac %U", unformat_ethernet_address, client_mac)) + client_mac_set = 1; + else + { + return -99; + } + } + + if (client_ip_set == 0) + { + errmsg ("session client_ip address not specified"); + return -99; + } + + if (ipv4_set && ipv6_set) + { + errmsg ("both IPv4 and IPv6 addresses specified"); + return -99; + } + + if (client_mac_set == 0) + { + errmsg("session client mac not specified"); + return -99; + } + + M (PPPOE_ADD_DEL_SESSION, mp); + + if (ipv6_set) + { + clib_memcpy (mp->client_ip, &client_ip.ip6, sizeof (client_ip.ip6)); + } + else + { + clib_memcpy (mp->client_ip, &client_ip.ip4, sizeof (client_ip.ip4)); + } + + mp->decap_vrf_id = ntohl (decap_vrf_id); + mp->session_id = ntohl (session_id); + mp->is_add = is_add; + mp->is_ipv6 = ipv6_set; + memcpy (mp->client_mac, client_mac, 6); + + S (mp); + W (ret); + return ret; +} + +static void vl_api_pppoe_session_details_t_handler + (vl_api_pppoe_session_details_t * mp) +{ + vat_main_t *vam = &vat_main; + ip46_address_t client_ip = to_ip46 (mp->is_ipv6, mp->client_ip); + + print (vam->ofp, "%11d%14d%24U%14d%14d%30U%30U", + ntohl (mp->sw_if_index), ntohl (mp->session_id), + format_ip46_address, &client_ip, IP46_TYPE_ANY, + ntohl (mp->encap_if_index), ntohl (mp->decap_vrf_id), + format_ethernet_address, mp->local_mac, + format_ethernet_address, mp->client_mac); +} + +static int +api_pppoe_session_dump (vat_main_t * vam) +{ + unformat_input_t *i = vam->input; + vl_api_pppoe_session_dump_t *mp; + u32 sw_if_index; + u8 sw_if_index_set = 0; + int ret; + + /* Parse args required to build the message */ + while (unformat_check_input (i) != 
UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (i, "sw_if_index %d", &sw_if_index))
+	sw_if_index_set = 1;
+      else
+	break;
+    }
+
+  if (sw_if_index_set == 0)
+    {
+      sw_if_index = ~0;
+    }
+
+  if (!vam->json_output)
+    {
+      print (vam->ofp, "%11s%14s%24s%14s%14s%30s%30s",
+	     "sw_if_index", "session_id", "client_ip",
+	     "encap_if_index", "decap_vrf_id",
+	     "local-mac", "client-mac");
+    }
+
+  /* Get list of pppoe-session interfaces */
+  M (PPPOE_SESSION_DUMP, mp);
+
+  mp->sw_if_index = htonl (sw_if_index);
+
+  S (mp);
+
+  W (ret);
+  return ret;
+}
+
+/*
+ * List of messages that the api test plugin sends,
+ * and that the data plane plugin processes
+ */
+#define foreach_vpe_api_msg                                     \
+_(pppoe_add_del_session,                                        \
+  " session_id <nn> client-ip <ip-addr>"                        \
+  " [decap-vrf-id <nn>] client-mac <mac> [del]")                \
+_(pppoe_session_dump, "[sw_if_index <nn>]")
+
+static void
+pppoe_vat_api_hookup (vat_main_t *vam)
+{
+  pppoe_test_main_t * pem = &pppoe_test_main;
+  /* Hook up handlers for replies from the data plane plug-in */
+#define _(N,n)                                                  \
+  vl_msg_api_set_handlers((VL_API_##N + pem->msg_id_base),      \
+                          #n,                                   \
+                          vl_api_##n##_t_handler,               \
+                          vl_noop_handler,                      \
+                          vl_api_##n##_t_endian,                \
+                          vl_api_##n##_t_print,                 \
+                          sizeof(vl_api_##n##_t), 1);
+  foreach_vpe_api_reply_msg;
+#undef _
+
+  /* API messages we can send */
+#define _(n,h) hash_set_mem (vam->function_by_name, #n, api_##n);
+  foreach_vpe_api_msg;
+#undef _
+
+  /* Help strings */
+#define _(n,h) hash_set_mem (vam->help_by_name, #n, h);
+  foreach_vpe_api_msg;
+#undef _
+}
+
+clib_error_t * vat_plugin_register (vat_main_t *vam)
+{
+  pppoe_test_main_t * pem = &pppoe_test_main;
+
+  u8 * name;
+
+  pem->vat_main = vam;
+
+  /* Ask the vpp engine for the first assigned message-id */
+  name = format (0, "pppoe_%08x%c", api_version, 0);
+  pem->msg_id_base = vl_client_get_first_plugin_msg_id ((char *) name);
+
+  if (pem->msg_id_base != (u16) ~0)
+    pppoe_vat_api_hookup (vam);
+
+  vec_free(name);
+
+  return 0;
+}
diff --git a/src/plugins/sixrd.am b/src/plugins/sixrd.am
new file mode 100644
index 00000000..0de45088
--- /dev/null
+++ b/src/plugins/sixrd.am
@@ -0,0 +1,26 @@
+# Copyright (c) 2015 Cisco and/or its affiliates.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+libsixrd_plugin_la_SOURCES =	\
+	sixrd/sixrd.c		\
+	sixrd/sixrd_dpo.c	\
+	sixrd/ip4_sixrd.c	\
+	sixrd/ip6_sixrd.c
+
+noinst_HEADERS +=		\
+	sixrd/sixrd.h		\
+	sixrd/sixrd_dpo.h
+
+vppplugins_LTLIBRARIES += libsixrd_plugin.la
+
+# vi:syntax=automake
diff --git a/src/plugins/sixrd/ip4_sixrd.c b/src/plugins/sixrd/ip4_sixrd.c
new file mode 100644
index 00000000..2fb8015d
--- /dev/null
+++ b/src/plugins/sixrd/ip4_sixrd.c
@@ -0,0 +1,127 @@
+/*---------------------------------------------------------------------------
+ * Copyright (c) 2009-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *--------------------------------------------------------------------------- + */ +#include "sixrd.h" + +static vlib_node_registration_t ip4_sixrd_node; + +typedef enum { + IP4_SIXRD_NEXT_IP6_LOOKUP, + IP4_SIXRD_NEXT_DROP, + IP4_SIXRD_N_NEXT, +} ip4_sixrd_next_t; + +/* + * ip4_sixrd_sec_check + */ +static_always_inline void +ip4_sixrd_sec_check (sixrd_domain_t *d, ip4_address_t sa4, ip6_address_t sa6, u8 *error) +{ + u32 a = sixrd_get_addr(d, sa6.as_u64[0]); + clib_warning("Security check: %U %U", format_ip4_address, &a, format_ip4_address, &sa4); + if (PREDICT_FALSE(sixrd_get_addr(d, sa6.as_u64[0]) != sa4.as_u32)) + *error = SIXRD_ERROR_SEC_CHECK; +} + +/* + * ip4_sixrd + */ +static uword +ip4_sixrd (vlib_main_t *vm, + vlib_node_runtime_t *node, + vlib_frame_t *frame) +{ + u32 n_left_from, *from, next_index, *to_next, n_left_to_next; + vlib_node_runtime_t *error_node = vlib_node_get_runtime(vm, ip4_sixrd_node.index); + u32 decap = 0; + + from = vlib_frame_vector_args(frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + while (n_left_from > 0) { + vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next); + + /* Single loop */ + while (n_left_from > 0 && n_left_to_next > 0) { + u32 pi0; + vlib_buffer_t *p0; + u8 error0 = SIXRD_ERROR_NONE; + sixrd_domain_t *d0 = 0; + ip4_header_t *ip40; + ip6_header_t *ip60; + u32 sixrd_domain_index0 = ~0; + u32 next0; + + pi0 = to_next[0] = from[0]; + from += 1; + n_left_from -= 1; + to_next +=1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer(vm, pi0); + ip40 = vlib_buffer_get_current(p0); + + /* Throw away anything that isn't IP in IP. */ + if (PREDICT_TRUE(ip40->protocol == IP_PROTOCOL_IPV6 && clib_net_to_host_u16(ip40->length) >= 60)) { + vlib_buffer_advance(p0, sizeof(ip4_header_t)); + ip60 = vlib_buffer_get_current(p0); + d0 = ip4_sixrd_get_domain(vnet_buffer(p0)->ip.adj_index[VLIB_TX], (ip6_address_t *)&ip60->src_address, + &sixrd_domain_index0, &error0); + } else { + error0 = SIXRD_ERROR_BAD_PROTOCOL; + } + if (d0) { + /* SIXRD inbound security check */ + ip4_sixrd_sec_check(d0, ip40->src_address, ip60->src_address, &error0); + } + + next0 = error0 == SIXRD_ERROR_NONE ? 
IP4_SIXRD_NEXT_IP6_LOOKUP : IP4_SIXRD_NEXT_DROP; + + if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED)) { + sixrd_trace_t *tr = vlib_add_trace(vm, node, p0, sizeof(*tr)); + tr->sixrd_domain_index = sixrd_domain_index0; + } + + p0->error = error_node->errors[error0]; + if (PREDICT_TRUE(error0 == SIXRD_ERROR_NONE)) decap++; + vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, n_left_to_next, pi0, next0); + + } + vlib_put_next_frame(vm, node, next_index, n_left_to_next); + } + vlib_node_increment_counter(vm, ip4_sixrd_node.index, SIXRD_ERROR_DECAPSULATED, decap); + + return frame->n_vectors; +} + +static char *sixrd_error_strings[] = { +#define _(sym,string) string, + foreach_sixrd_error +#undef _ +}; + +VLIB_REGISTER_NODE(ip4_sixrd_node,static) = { + .function = ip4_sixrd, + .name = "ip4-sixrd", + .vector_size = sizeof(u32), + .format_trace = format_sixrd_trace, + .n_errors = SIXRD_N_ERROR, + .error_strings = sixrd_error_strings, + .n_next_nodes = IP4_SIXRD_N_NEXT, + .next_nodes = { + [IP4_SIXRD_NEXT_IP6_LOOKUP] = "ip6-lookup", + [IP4_SIXRD_NEXT_DROP] = "error-drop", + }, +}; diff --git a/src/plugins/sixrd/ip6_sixrd.c b/src/plugins/sixrd/ip6_sixrd.c new file mode 100644 index 00000000..36f3fab3 --- /dev/null +++ b/src/plugins/sixrd/ip6_sixrd.c @@ -0,0 +1,129 @@ +/*--------------------------------------------------------------------------- + * Copyright (c) 2009-2014 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *--------------------------------------------------------------------------- + */ +/* + * Defines used for testing various optimisation schemes + */ +#define SIXRD_ENCAP_DUAL 0 + +#include "sixrd.h" + +static vlib_node_registration_t ip6_sixrd_node; + +typedef enum { + IP6_SIXRD_NEXT_IP4_LOOKUP, + IP6_SIXRD_NEXT_DROP, + IP6_SIXRD_N_NEXT, +} ip6_sixrd_next_t; + +/* + * ip6_sixrd + */ +static uword +ip6_sixrd (vlib_main_t *vm, + vlib_node_runtime_t *node, + vlib_frame_t *frame) +{ + u32 n_left_from, *from, next_index, *to_next, n_left_to_next; + vlib_node_runtime_t *error_node = vlib_node_get_runtime(vm, ip6_sixrd_node.index); + u32 encap = 0; + from = vlib_frame_vector_args(frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) { + vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) { + u32 pi0; + vlib_buffer_t *p0; + sixrd_domain_t *d0; + u8 error0 = SIXRD_ERROR_NONE; + ip6_header_t *ip60; + ip4_header_t *ip4h0; + u32 next0 = IP6_SIXRD_NEXT_IP4_LOOKUP; + u32 sixrd_domain_index0 = ~0; + + pi0 = to_next[0] = from[0]; + from += 1; + n_left_from -= 1; + to_next +=1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer(vm, pi0); + ip60 = vlib_buffer_get_current(p0); + // p0->current_length = clib_net_to_host_u16(ip40->length); + d0 = ip6_sixrd_get_domain(vnet_buffer(p0)->ip.adj_index[VLIB_TX], &sixrd_domain_index0); + ASSERT(d0); + + /* SIXRD calc */ + u64 dal60 = clib_net_to_host_u64(ip60->dst_address.as_u64[0]); + u32 da40 = sixrd_get_addr(d0, dal60); + u16 len = clib_net_to_host_u16(ip60->payload_length) + 60; + if (da40 == 0) error0 = SIXRD_ERROR_UNKNOWN; + + /* construct ipv4 header */ + vlib_buffer_advance(p0, - (sizeof(ip4_header_t))); + ip4h0 = vlib_buffer_get_current(p0); + vnet_buffer(p0)->sw_if_index[VLIB_TX] = (u32)~0; + ip4h0->ip_version_and_header_length = 0x45; + ip4h0->tos = 0; + ip4h0->length = clib_host_to_net_u16(len); + ip4h0->fragment_id = 0; + ip4h0->flags_and_fragment_offset = 0; + ip4h0->ttl = 0x40; + ip4h0->protocol = IP_PROTOCOL_IPV6; + ip4h0->src_address = d0->ip4_src; + ip4h0->dst_address.as_u32 = clib_host_to_net_u32(da40); + ip4h0->checksum = ip4_header_checksum(ip4h0); + + next0 = error0 == SIXRD_ERROR_NONE ? 
IP6_SIXRD_NEXT_IP4_LOOKUP : IP6_SIXRD_NEXT_DROP; + + if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED)) { + sixrd_trace_t *tr = vlib_add_trace(vm, node, p0, sizeof(*tr)); + tr->sixrd_domain_index = sixrd_domain_index0; + } + + p0->error = error_node->errors[error0]; + if (PREDICT_TRUE(error0 == SIXRD_ERROR_NONE)) encap++; + + vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, n_left_to_next, pi0, next0); + } + vlib_put_next_frame(vm, node, next_index, n_left_to_next); + } + vlib_node_increment_counter(vm, ip6_sixrd_node.index, SIXRD_ERROR_ENCAPSULATED, encap); + + return frame->n_vectors; +} + +static char *sixrd_error_strings[] = { +#define _(sym,string) string, + foreach_sixrd_error +#undef _ +}; + +VLIB_REGISTER_NODE(ip6_sixrd_node,static) = { + .function = ip6_sixrd, + .name = "ip6-sixrd", + .vector_size = sizeof(u32), + .format_trace = format_sixrd_trace, + .n_errors = SIXRD_N_ERROR, + .error_strings = sixrd_error_strings, + .n_next_nodes = IP6_SIXRD_N_NEXT, + .next_nodes = { + [IP6_SIXRD_NEXT_IP4_LOOKUP] = "ip4-lookup", + [IP6_SIXRD_NEXT_DROP] = "error-drop", + }, +}; diff --git a/src/plugins/sixrd/sixrd.c b/src/plugins/sixrd/sixrd.c new file mode 100644 index 00000000..98387525 --- /dev/null +++ b/src/plugins/sixrd/sixrd.c @@ -0,0 +1,377 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "sixrd.h"
+#include <vnet/plugin/plugin.h>
+
+#include <vnet/fib/fib_table.h>
+#include <vnet/fib/ip6_fib.h>
+#include <vnet/adj/adj.h>
+#include <vpp/app/version.h>
+
+/*
+ * This code supports the following sixrd modes:
+ *
+ * 32 EA bits (complete IPv4 address is embedded):
+ *   ea_bits_len = 32
+ * IPv4 suffix is embedded:
+ *   0 < ea_bits_len < 32
+ * No embedded address bits (1:1 mode):
+ *   ea_bits_len = 0
+ */
+
+int
+sixrd_create_domain (ip6_address_t *ip6_prefix,
+		     u8 ip6_prefix_len,
+		     ip4_address_t *ip4_prefix,
+		     u8 ip4_prefix_len,
+		     ip4_address_t *ip4_src,
+		     u32 *sixrd_domain_index,
+		     u16 mtu)
+{
+  dpo_id_t dpo_v6 = DPO_INVALID, dpo_v4 = DPO_INVALID;
+  sixrd_main_t *mm = &sixrd_main;
+  fib_node_index_t fei;
+  sixrd_domain_t *d;
+
+  /* Get domain index */
+  pool_get_aligned(mm->domains, d, CLIB_CACHE_LINE_BYTES);
+  memset(d, 0, sizeof (*d));
+  *sixrd_domain_index = d - mm->domains;
+
+  /* Init domain struct */
+  d->ip4_prefix.as_u32 = ip4_prefix->as_u32;
+  d->ip4_prefix_len = ip4_prefix_len;
+  d->ip6_prefix = *ip6_prefix;
+  d->ip6_prefix_len = ip6_prefix_len;
+  d->ip4_src = *ip4_src;
+  d->mtu = mtu;
+
+  if (ip4_prefix_len < 32)
+    d->shift = 64 - ip6_prefix_len + (32 - ip4_prefix_len);
+
+  /* Create IPv6 route/adjacency */
+  fib_prefix_t pfx6 = {
+    .fp_proto = FIB_PROTOCOL_IP6,
+    .fp_len = d->ip6_prefix_len,
+    .fp_addr = {
+      .ip6 = d->ip6_prefix,
+    },
+  };
+  sixrd_dpo_create(DPO_PROTO_IP6,
+                   *sixrd_domain_index,
+                   &dpo_v6);
+  fib_table_entry_special_dpo_add(0, &pfx6,
+                                  FIB_SOURCE_SIXRD,
+                                  FIB_ENTRY_FLAG_EXCLUSIVE,
+                                  &dpo_v6);
+  dpo_reset (&dpo_v6);
+
+  /*
+   * Multiple SIXRD domains may share the same source IPv4 TEP.
+   * In this case the route will already exist and be SixRD sourced.
+   * Find the adj (if any) already contributed and modify it.
+   */
+  fib_prefix_t pfx4 = {
+    .fp_proto = FIB_PROTOCOL_IP4,
+    .fp_len = 32,
+    .fp_addr = {
+      .ip4 = d->ip4_src,
+    },
+  };
+  fei = fib_table_lookup_exact_match(0, &pfx4);
+
+  if (FIB_NODE_INDEX_INVALID != fei)
+    {
+      dpo_id_t dpo = DPO_INVALID;
+
+      if (fib_entry_get_dpo_for_source (fei, FIB_SOURCE_SIXRD, &dpo))
+        {
+          /*
+           * modify the existing adj to indicate it's shared and
+           * skip to the route add.
+           * It is locked to pair with the unlock below.
+           */
+          const dpo_id_t *sd_dpo;
+          sixrd_dpo_t *sd;
+
+          ASSERT(DPO_LOAD_BALANCE == dpo.dpoi_type);
+
+          sd_dpo = load_balance_get_bucket(dpo.dpoi_index, 0);
+          sd = sixrd_dpo_get (sd_dpo->dpoi_index);
+
+          sd->sd_domain = ~0;
+          dpo_copy (&dpo_v4, sd_dpo);
+          dpo_reset (&dpo);
+
+          goto route_add;
+        }
+    }
+  /* first time addition of the route */
+  sixrd_dpo_create(DPO_PROTO_IP4,
+                   *sixrd_domain_index,
+                   &dpo_v4);
+
+route_add:
+  /*
+   * Create the ip4 route. This is a reference-counted add. If the
+   * prefix already exists and is SixRD sourced, it is now SixRD
+   * sourced n+1 times and will need to be removed n+1 times.
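+   *
+   * For example (hypothetical address): two domains created with the
+   * same ip4-src 192.0.2.1 each add 192.0.2.1/32 from
+   * FIB_SOURCE_SIXRD; the route, and the shared DPO flagged with
+   * sd_domain = ~0 above, only disappear after both domains have been
+   * deleted.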
+ */ + fib_table_entry_special_dpo_add(0, &pfx4, + FIB_SOURCE_SIXRD, + FIB_ENTRY_FLAG_EXCLUSIVE, + &dpo_v4); + dpo_reset (&dpo_v4); + + return 0; +} + +/* + * sixrd_delete_domain + */ +int +sixrd_delete_domain (u32 sixrd_domain_index) +{ + sixrd_main_t *mm = &sixrd_main; + sixrd_domain_t *d; + + if (pool_is_free_index(mm->domains, sixrd_domain_index)) { + clib_warning("SIXRD domain delete: domain does not exist: %d", + sixrd_domain_index); + return -1; + } + + d = pool_elt_at_index(mm->domains, sixrd_domain_index); + + fib_prefix_t pfx = { + .fp_proto = FIB_PROTOCOL_IP4, + .fp_len = 32, + .fp_addr = { + .ip4 = d->ip4_src, + }, + }; + fib_table_entry_special_remove(0, &pfx, FIB_SOURCE_SIXRD); + + fib_prefix_t pfx6 = { + .fp_proto = FIB_PROTOCOL_IP6, + .fp_len = d->ip6_prefix_len, + .fp_addr = { + .ip6 = d->ip6_prefix, + }, + }; + fib_table_entry_special_remove(0, &pfx6, FIB_SOURCE_SIXRD); + + pool_put(mm->domains, d); + + return 0; +} + +static clib_error_t * +sixrd_add_domain_command_fn (vlib_main_t *vm, + unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + ip4_address_t ip4_prefix; + ip6_address_t ip6_prefix; + ip4_address_t ip4_src; + u32 ip6_prefix_len=0, ip4_prefix_len=0, sixrd_domain_index; + u32 num_m_args = 0; + /* Optional arguments */ + u32 mtu = 0; + clib_error_t *error = 0; + + /* Get a line of input. */ + if (!unformat_user(input, unformat_line_input, line_input)) + return 0; + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) { + if (unformat(line_input, "ip6-pfx %U/%d", unformat_ip6_address, &ip6_prefix, &ip6_prefix_len)) + num_m_args++; + else if (unformat(line_input, "ip4-pfx %U/%d", unformat_ip4_address, &ip4_prefix, &ip4_prefix_len)) + num_m_args++; + else if (unformat(line_input, "ip4-src %U", unformat_ip4_address, &ip4_src)) + num_m_args++; + else if (unformat(line_input, "mtu %d", &mtu)) + num_m_args++; + else { + error = clib_error_return(0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (num_m_args < 3) { + error = clib_error_return(0, "mandatory argument(s) missing"); + goto done; + } + + sixrd_create_domain(&ip6_prefix, ip6_prefix_len, &ip4_prefix, ip4_prefix_len, + &ip4_src, &sixrd_domain_index, mtu); + +done: + unformat_free (line_input); + + return error; +} + +static clib_error_t * +sixrd_del_domain_command_fn (vlib_main_t *vm, + unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + u32 num_m_args = 0; + u32 sixrd_domain_index; + clib_error_t *error = 0; + + /* Get a line of input. */ + if (! 
unformat_user(input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input(line_input) != UNFORMAT_END_OF_INPUT) { + if (unformat(line_input, "index %d", &sixrd_domain_index)) + num_m_args++; + else { + error = clib_error_return(0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } + } + + if (num_m_args != 1) { + error = clib_error_return(0, "mandatory argument(s) missing"); + goto done; + } + + sixrd_delete_domain(sixrd_domain_index); + +done: + unformat_free (line_input); + + return error; +} + +static u8 * +format_sixrd_domain (u8 *s, va_list *args) +{ + sixrd_domain_t *d = va_arg(*args, sixrd_domain_t *); + sixrd_main_t *mm = &sixrd_main; + + s = format(s, + "[%d] ip6-pfx %U/%d ip4-pfx %U/%d ip4-src %U mtu %d", + d - mm->domains, + format_ip6_address, &d->ip6_prefix, d->ip6_prefix_len, + format_ip4_address, &d->ip4_prefix, d->ip4_prefix_len, + format_ip4_address, &d->ip4_src, d->mtu); + + return s; +} + +static clib_error_t * +show_sixrd_domain_command_fn (vlib_main_t *vm, unformat_input_t *input, vlib_cli_command_t *cmd) +{ + sixrd_main_t *mm = &sixrd_main; + sixrd_domain_t *d; + + if (pool_elts(mm->domains) == 0) + vlib_cli_output(vm, "No SIXRD domains are configured..."); + + pool_foreach(d, mm->domains, ({vlib_cli_output(vm, "%U", format_sixrd_domain, d);})); + + return 0; + +} + +static clib_error_t * +show_sixrd_stats_command_fn (vlib_main_t *vm, unformat_input_t *input, vlib_cli_command_t *cmd) +{ + sixrd_main_t *mm = &sixrd_main; + sixrd_domain_t *d; + int domains = 0, domaincount = 0; + if (pool_elts (mm->domains) == 0) + vlib_cli_output (vm, "No SIXRD domains are configured..."); + + pool_foreach(d, mm->domains, ({ + domains += sizeof(*d); + domaincount++; + })); + + vlib_cli_output(vm, "SIXRD domains structure: %d\n", sizeof (sixrd_domain_t)); + vlib_cli_output(vm, "SIXRD domains: %d (%d bytes)\n", domaincount, domains); + + return 0; +} + +/* + * packet trace format function + */ +u8 * +format_sixrd_trace (u8 *s, va_list *args) +{ + CLIB_UNUSED(vlib_main_t *vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED(vlib_node_t *node) = va_arg (*args, vlib_node_t *); + sixrd_trace_t *t = va_arg (*args, sixrd_trace_t *); + u32 sixrd_domain_index = t->sixrd_domain_index; + + s = format(s, "SIXRD domain index: %d", sixrd_domain_index); + + return s; +} + +VLIB_CLI_COMMAND(sixrd_add_domain_command, static) = { + .path = "sixrd add domain", + .short_help = + "sixrd add domain ip6-pfx <ip6-pfx> ip4-pfx <ip4-pfx> ip4-src <ip4-addr>", + .function = sixrd_add_domain_command_fn, +}; + +VLIB_CLI_COMMAND(sixrd_del_command, static) = { + .path = "sixrd del domain", + .short_help = + "sixrd del domain index <domain>", + .function = sixrd_del_domain_command_fn, +}; + +VLIB_CLI_COMMAND(show_sixrd_domain_command, static) = { + .path = "show sixrd domain", + .function = show_sixrd_domain_command_fn, +}; + +VLIB_CLI_COMMAND(show_sixrd_stats_command, static) = { + .path = "show sixrd stats", + .function = show_sixrd_stats_command_fn, +}; + +/* *INDENT-OFF* */ +VLIB_PLUGIN_REGISTER () ={ + .version = VPP_BUILD_VER, + .description = "IPv6 Rapid Deployment on IPv4 Infrastructure (RFC5969)", +}; +/* *INDENT-ON* */ + +static clib_error_t * sixrd_init (vlib_main_t * vm) +{ + sixrd_main_t *mm = &sixrd_main; + + mm->vnet_main = vnet_get_main(); + mm->vlib_main = vm; + + sixrd_dpo_module_init (); + + return (NULL); +} + +VLIB_INIT_FUNCTION (sixrd_init); diff --git a/src/plugins/sixrd/sixrd.h b/src/plugins/sixrd/sixrd.h new file mode 100644 index 
00000000..56714c9e --- /dev/null +++ b/src/plugins/sixrd/sixrd.h @@ -0,0 +1,141 @@ +/*--------------------------------------------------------------------------- + * Copyright (c) 2009-2014 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *--------------------------------------------------------------------------- + */ +#include <stdbool.h> +#include <vppinfra/error.h> +#include <vnet/vnet.h> +#include <vnet/ip/ip.h> +#include <vnet/fib/ip6_fib.h> + +#include "sixrd_dpo.h" + +int sixrd_create_domain(ip6_address_t *ip6_prefix, u8 ip6_prefix_len, + ip4_address_t *ip4_prefix, u8 ip4_prefix_len, + ip4_address_t *ip4_src, u32 *sixrd_domain_index, u16 mtu); +int sixrd_delete_domain(u32 sixrd_domain_index); +u8 *format_sixrd_trace(u8 *s, va_list *args); + +typedef struct { + ip6_address_t ip6_prefix; + ip4_address_t ip4_prefix; + ip4_address_t ip4_src; + u8 ip6_prefix_len; + u8 ip4_prefix_len; + + /* helpers */ + u8 shift; + + u16 mtu; +} sixrd_domain_t; + +typedef struct { + /* pool of SIXRD domains */ + sixrd_domain_t *domains; + + /* convenience */ + vlib_main_t *vlib_main; + vnet_main_t *vnet_main; +} sixrd_main_t; + +#define foreach_sixrd_error \ + /* Must be first. */ \ + _(NONE, "valid SIXRD packets") \ + _(BAD_PROTOCOL, "bad protocol") \ + _(WRONG_ICMP_TYPE, "wrong icmp type") \ + _(SEC_CHECK, "security check failed") \ + _(ICMP, "unable to translate ICMP") \ + _(UNKNOWN, "unknown") \ + _(NO_DOMAIN, "no domain") \ + _(ENCAPSULATED, "encapsulated") \ + _(DECAPSULATED, "decapsulated") \ + _(TRANSLATED_4TO6, "translated 4 to 6") \ + _(TRANSLATED_6TO4, "translated 6 to 4") \ + _(FRAGMENT, "fragment handling error") \ + _(FRAGMENT_QUEUED, "dropped, missing first fragment") \ + _(FRAGMENTED, "packets requiring fragmentation") \ + _(FRAGMENT_PARTS, "fragment parts") \ + _(MALFORMED, "malformed packet") + +typedef enum { +#define _(sym,str) SIXRD_ERROR_##sym, + foreach_sixrd_error +#undef _ + SIXRD_N_ERROR, + } sixrd_error_t; + +typedef struct { + u32 sixrd_domain_index; +} sixrd_trace_t; + +sixrd_main_t sixrd_main; + +/* + * sixrd_get_addr + */ +static_always_inline u32 +sixrd_get_addr (sixrd_domain_t *d, u64 dal) +{ + + /* 1:1 mode */ + if (d->ip4_prefix_len == 32) return (d->ip4_prefix.as_u32); + + /* Grab 32 - ip4_prefix_len bits out of IPv6 address from offset ip6_prefix_len */ + return (d->ip4_prefix.as_u32 | (u32)(dal >> d->shift)); +} + +/* + * Get the SIXRD domain from an IPv6 lookup adjacency. + */ +static_always_inline sixrd_domain_t * +ip6_sixrd_get_domain (u32 sdi, u32 *sixrd_domain_index) +{ + sixrd_main_t *mm = &sixrd_main; + sixrd_dpo_t *sd; + + sd = sixrd_dpo_get(sdi); + + ASSERT(sd); + *sixrd_domain_index = sd->sd_domain; + return pool_elt_at_index(mm->domains, *sixrd_domain_index); +} + +/* + * Get the SIXRD domain from an IPv4 lookup adjacency. + * If the IPv4 address is not shared, no lookup is required. + * The IPv6 address is used otherwise. 
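+ *
+ * "Not shared" means the DPO still carries a valid domain index and is
+ * returned directly. When several domains share the IPv4 source
+ * (sd_domain == ~0, see sixrd_create_domain), the inner IPv6 source
+ * address is looked up in the IPv6 FIB instead and the owning domain
+ * is recovered from the 6rd DPO found there.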
+ */
+static_always_inline sixrd_domain_t *
+ip4_sixrd_get_domain (u32 sdi, ip6_address_t *addr,
+		      u32 *sixrd_domain_index, u8 *error)
+{
+  sixrd_main_t *mm = &sixrd_main;
+  sixrd_dpo_t *sd;
+
+  sd = sixrd_dpo_get(sdi);
+  *sixrd_domain_index = sd->sd_domain;
+  if (*sixrd_domain_index != ~0)
+    return pool_elt_at_index(mm->domains, *sixrd_domain_index);
+
+  u32 lbi = ip6_fib_table_fwding_lookup(&ip6_main, 0, addr);
+  const dpo_id_t *dpo = load_balance_get_bucket(lbi, 0);
+  if (PREDICT_TRUE(dpo->dpoi_type == sixrd_dpo_type))
+    {
+      sd = sixrd_dpo_get(dpo->dpoi_index);
+      *sixrd_domain_index = sd->sd_domain;
+      return pool_elt_at_index(mm->domains, *sixrd_domain_index);
+    }
+  *error = SIXRD_ERROR_NO_DOMAIN;
+  return NULL;
+}
diff --git a/src/plugins/sixrd/sixrd_dpo.c b/src/plugins/sixrd/sixrd_dpo.c
new file mode 100644
index 00000000..88a07935
--- /dev/null
+++ b/src/plugins/sixrd/sixrd_dpo.c
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "sixrd_dpo.h"
+#include <vnet/ip/ip.h>
+
+/**
+ * pool of all SIXRD DPOs
+ */
+sixrd_dpo_t *sixrd_dpo_pool;
+
+/**
+ * The registered SIXRD DPO type
+ */
+dpo_type_t sixrd_dpo_type;
+
+static sixrd_dpo_t *
+sixrd_dpo_alloc (void)
+{
+  sixrd_dpo_t *sd;
+
+  pool_get_aligned(sixrd_dpo_pool, sd, CLIB_CACHE_LINE_BYTES);
+  memset(sd, 0, sizeof(*sd));
+
+  return (sd);
+}
+
+static index_t
+sixrd_dpo_get_index (sixrd_dpo_t *sd)
+{
+  return (sd - sixrd_dpo_pool);
+}
+
+void
+sixrd_dpo_create (dpo_proto_t dproto,
+                  u32 domain_index,
+                  dpo_id_t *dpo)
+{
+  sixrd_dpo_t *sd;
+
+  sd = sixrd_dpo_alloc();
+  sd->sd_domain = domain_index;
+  sd->sd_proto = dproto;
+
+  dpo_set(dpo,
+          sixrd_dpo_type,
+          dproto,
+          sixrd_dpo_get_index(sd));
+}
+
+u8*
+format_sixrd_dpo (u8 *s, va_list *args)
+{
+  index_t index = va_arg (*args, index_t);
+  CLIB_UNUSED(u32 indent) = va_arg (*args, u32);
+  sixrd_dpo_t *sd;
+
+  sd = sixrd_dpo_get(index);
+
+  return (format(s, "sixrd:[%d]:%U domain:%d",
+                 index,
+                 format_dpo_proto, sd->sd_proto,
+                 sd->sd_domain));
+}
+
+static void
+sixrd_dpo_lock (dpo_id_t *dpo)
+{
+  sixrd_dpo_t *sd;
+
+  sd = sixrd_dpo_get(dpo->dpoi_index);
+
+  sd->sd_locks++;
+}
+
+static void
+sixrd_dpo_unlock (dpo_id_t *dpo)
+{
+  sixrd_dpo_t *sd;
+
+  sd = sixrd_dpo_get(dpo->dpoi_index);
+
+  sd->sd_locks--;
+
+  if (0 == sd->sd_locks)
+    {
+      pool_put(sixrd_dpo_pool, sd);
+    }
+}
+
+const static dpo_vft_t sd_vft = {
+  .dv_lock = sixrd_dpo_lock,
+  .dv_unlock = sixrd_dpo_unlock,
+  .dv_format = format_sixrd_dpo,
+};
+
+const static char* const sixrd_ip4_nodes[] =
+{
+  "ip4-sixrd",
+  NULL,
+};
+const static char* const sixrd_ip6_nodes[] =
+{
+  "ip6-sixrd",
+  NULL,
+};
+
+const static char* const * const sixrd_nodes[DPO_PROTO_NUM] =
+{
+  [DPO_PROTO_IP4] = sixrd_ip4_nodes,
+  [DPO_PROTO_IP6] = sixrd_ip6_nodes,
+  [DPO_PROTO_MPLS] = NULL,
+};
+
+void
+sixrd_dpo_module_init (void)
+{
+  sixrd_dpo_type = dpo_register_new_type(&sd_vft, sixrd_nodes);
+}
diff --git a/src/plugins/sixrd/sixrd_dpo.h b/src/plugins/sixrd/sixrd_dpo.h
new file mode 100644
index 00000000..17142288
--- /dev/null
+++ b/src/plugins/sixrd/sixrd_dpo.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __SIXRD_DPO_H__
+#define __SIXRD_DPO_H__
+
+#include <vnet/vnet.h>
+#include <vnet/dpo/dpo.h>
+
+/**
+ * A representation of a 6RD DPO
+ */
+typedef struct sixrd_dpo_t
+{
+  /**
+   * The data-plane protocol
+   */
+  dpo_proto_t sd_proto;
+
+  /**
+   * the SIXRD domain index
+   */
+  u32 sd_domain;
+
+  /**
+   * Number of locks/users of this DPO
+   */
+  u16 sd_locks;
+} sixrd_dpo_t;
+
+extern void sixrd_dpo_create (dpo_proto_t dproto,
+                              u32 domain_index,
+                              dpo_id_t *dpo);
+
+/*
+ * Encapsulation violation for fast data-path access
+ */
+extern sixrd_dpo_t *sixrd_dpo_pool;
+extern dpo_type_t sixrd_dpo_type;
+
+static inline sixrd_dpo_t *
+sixrd_dpo_get (index_t index)
+{
+  return (pool_elt_at_index(sixrd_dpo_pool, index));
+}
+
+extern void sixrd_dpo_module_init(void);
+
+#endif
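+
+/*
+ * Illustrative CLI usage of the commands registered in sixrd.c
+ * (hypothetical prefixes, shown only as an example):
+ *
+ *   sixrd add domain ip6-pfx 2001:db8::/32 ip4-pfx 192.0.2.0/24 ip4-src 198.51.100.1 mtu 1480
+ *   show sixrd domain
+ *   show sixrd stats
+ *   sixrd del domain index 0
+ *
+ * IPv6 packets matching 2001:db8::/32 are then encapsulated by the
+ * ip6-sixrd node; IPv4 packets addressed to 198.51.100.1 are
+ * decapsulated by ip4-sixrd after the inbound security check.
+ */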