diff options
author | Neale Ranns <neale@graphiant.com> | 2020-12-21 08:29:34 +0000 |
---|---|---|
committer | Ole Trøan <otroan@employees.org> | 2021-02-15 17:27:48 +0000 |
commit | 8f5fef2c78b95de1a636ce27111722b71702212a (patch) | |
tree | a0ebd0189969ccae1f0bdd7c1a9c18dd7a066f2e /src/vnet | |
parent | 54be0cc044f445853fae7b8995c477605250af16 (diff) |
ip: Path MTU
Type: feature
Support setting the MTU for a peer on an interface. The minimum value of
the path and interface MTU is used at forwarding time.
The path MTU is specified for a given peer, by address and table-ID.
In the forwarding plane the MTU is enforced either:
1 - if the peer is attached, then the MTU is set on the peer's
adjacency
2 - if the peer is not attached (i.e. it is remote), then a DPO is added to
the peer's FIB entry to perform the necessary fragmentation.
Signed-off-by: Neale Ranns <neale@graphiant.com>
Change-Id: I8b9ea6a07868b50e97e2561f18d9335407dea7ae
Diffstat (limited to 'src/vnet')
-rw-r--r-- | src/vnet/CMakeLists.txt | 3 | ||||
-rw-r--r-- | src/vnet/adj/adj.c | 24 | ||||
-rw-r--r-- | src/vnet/adj/adj.h | 3 | ||||
-rw-r--r-- | src/vnet/adj/adj_glean.c | 1 | ||||
-rw-r--r-- | src/vnet/adj/adj_internal.h | 1 | ||||
-rw-r--r-- | src/vnet/adj/adj_mcast.c | 2 | ||||
-rw-r--r-- | src/vnet/adj/adj_midchain.c | 1 | ||||
-rw-r--r-- | src/vnet/adj/adj_nbr.c | 25 | ||||
-rw-r--r-- | src/vnet/adj/adj_nbr.h | 7 | ||||
-rw-r--r-- | src/vnet/dpo/dpo.c | 16 | ||||
-rw-r--r-- | src/vnet/dpo/dpo.h | 23 | ||||
-rw-r--r-- | src/vnet/dpo/load_balance.c | 23 | ||||
-rw-r--r-- | src/vnet/dpo/mpls_label_dpo.c | 17 | ||||
-rw-r--r-- | src/vnet/fib/fib_entry.c | 20 | ||||
-rw-r--r-- | src/vnet/fib/fib_node.h | 6 | ||||
-rw-r--r-- | src/vnet/fib/fib_path.c | 1 | ||||
-rw-r--r-- | src/vnet/gre/gre.c | 9 | ||||
-rw-r--r-- | src/vnet/ip/ip.api | 57 | ||||
-rw-r--r-- | src/vnet/ip/ip_api.c | 92 | ||||
-rw-r--r-- | src/vnet/ip/ip_path_mtu.c | 883 | ||||
-rw-r--r-- | src/vnet/ip/ip_path_mtu.h | 126 | ||||
-rw-r--r-- | src/vnet/ip/ip_path_mtu_node.c | 206 |
22 files changed, 1528 insertions, 18 deletions
diff --git a/src/vnet/CMakeLists.txt b/src/vnet/CMakeLists.txt index 0e1d9c44b05..38aeda5be93 100644 --- a/src/vnet/CMakeLists.txt +++ b/src/vnet/CMakeLists.txt @@ -415,6 +415,8 @@ list(APPEND VNET_SOURCES ip/ip_interface.c ip/ip_init.c ip/ip_in_out_acl.c + ip/ip_path_mtu.c + ip/ip_path_mtu_node.c ip/ip_punt_drop.c ip/ip_types.c ip/lookup.c @@ -437,6 +439,7 @@ list(APPEND VNET_MULTIARCH_SOURCES ip/ip6_punt_drop.c ip/punt_node.c ip/ip_in_out_acl.c + ip/ip_path_mtu_node.c ) list(APPEND VNET_HEADERS diff --git a/src/vnet/adj/adj.c b/src/vnet/adj/adj.c index d3890223dc1..8808294f7a6 100644 --- a/src/vnet/adj/adj.c +++ b/src/vnet/adj/adj.c @@ -20,6 +20,7 @@ #include <vnet/adj/adj_mcast.h> #include <vnet/adj/adj_delegate.h> #include <vnet/fib/fib_node_list.h> +#include <vnet/fib/fib_walk.h> /* Adjacency packet/byte counters indexed by adjacency index. */ vlib_combined_counter_main_t adjacency_counters = { @@ -326,6 +327,16 @@ adj_dpo_get_urpf (const dpo_id_t *dpo) return (adj->rewrite_header.sw_if_index); } +u16 +adj_dpo_get_mtu (const dpo_id_t *dpo) +{ + ip_adjacency_t *adj; + + adj = adj_get(dpo->dpoi_index); + + return (adj->rewrite_header.max_l3_packet_bytes); +} + void adj_lock (adj_index_t adj_index) { @@ -465,6 +476,19 @@ adj_mtu_update_walk_cb (adj_index_t ai, vnet_rewrite_update_mtu (vnet_get_main(), adj->ia_link, &adj->rewrite_header); + adj_delegate_adj_modified(adj); + + /** + * Backwalk to all Path MTU trackers, casual like .. 
+ */ + { + fib_node_back_walk_ctx_t bw_ctx = { + .fnbw_reason = FIB_NODE_BW_REASON_FLAG_ADJ_MTU, + }; + + fib_walk_async(FIB_NODE_TYPE_ADJ, ai, + FIB_WALK_PRIORITY_LOW, &bw_ctx); + } return (ADJ_WALK_RC_CONTINUE); } diff --git a/src/vnet/adj/adj.h b/src/vnet/adj/adj.h index 44bb2bd981b..c1922c755ec 100644 --- a/src/vnet/adj/adj.h +++ b/src/vnet/adj/adj.h @@ -373,6 +373,9 @@ STATIC_ASSERT ((STRUCT_OFFSET_OF (ip_adjacency_t, cacheline3) == /* An adj fits into 4 cachelines on your average machine */ STATIC_ASSERT_SIZEOF (ip_adjacency_t, 4 * 64); #endif +STATIC_ASSERT ((STRUCT_OFFSET_OF (ip_adjacency_t, sub_type.nbr.next_hop) == + STRUCT_OFFSET_OF (ip_adjacency_t, sub_type.midchain.next_hop)), + "IP adjacency nbr and midchain offsets don't match"); /** * @brief diff --git a/src/vnet/adj/adj_glean.c b/src/vnet/adj/adj_glean.c index c52e3d09693..e956318a1ff 100644 --- a/src/vnet/adj/adj_glean.c +++ b/src/vnet/adj/adj_glean.c @@ -467,6 +467,7 @@ const static dpo_vft_t adj_glean_dpo_vft = { .dv_unlock = adj_dpo_unlock, .dv_format = format_adj_glean, .dv_get_urpf = adj_dpo_get_urpf, + .dv_get_mtu = adj_dpo_get_mtu, }; /** diff --git a/src/vnet/adj/adj_internal.h b/src/vnet/adj/adj_internal.h index 6639d32267f..253c1e982c1 100644 --- a/src/vnet/adj/adj_internal.h +++ b/src/vnet/adj/adj_internal.h @@ -126,6 +126,7 @@ extern void adj_mcast_remove(fib_protocol_t proto, extern void adj_midchain_teardown(ip_adjacency_t *adj); extern u32 adj_dpo_get_urpf(const dpo_id_t *dpo); +extern u16 adj_dpo_get_mtu(const dpo_id_t *dpo); /* * Adj BFD diff --git a/src/vnet/adj/adj_mcast.c b/src/vnet/adj/adj_mcast.c index 590652244e6..a20f61f6f6b 100644 --- a/src/vnet/adj/adj_mcast.c +++ b/src/vnet/adj/adj_mcast.c @@ -388,12 +388,14 @@ const static dpo_vft_t adj_mcast_dpo_vft = { .dv_unlock = adj_dpo_unlock, .dv_format = format_adj_mcast, .dv_get_urpf = adj_dpo_get_urpf, + .dv_get_mtu = adj_dpo_get_mtu, }; const static dpo_vft_t adj_mcast_midchain_dpo_vft = { .dv_lock = adj_dpo_lock, .dv_unlock 
= adj_dpo_unlock, .dv_format = format_adj_mcast_midchain, .dv_get_urpf = adj_dpo_get_urpf, + .dv_get_mtu = adj_dpo_get_mtu, }; /** diff --git a/src/vnet/adj/adj_midchain.c b/src/vnet/adj/adj_midchain.c index a21cd21ea25..3d879e9d7fc 100644 --- a/src/vnet/adj/adj_midchain.c +++ b/src/vnet/adj/adj_midchain.c @@ -744,6 +744,7 @@ const static dpo_vft_t adj_midchain_dpo_vft = { .dv_unlock = adj_dpo_unlock, .dv_format = format_adj_midchain, .dv_get_urpf = adj_dpo_get_urpf, + .dv_get_mtu = adj_dpo_get_mtu, }; /** diff --git a/src/vnet/adj/adj_nbr.c b/src/vnet/adj/adj_nbr.c index 921588a7ef7..811d0b8faa2 100644 --- a/src/vnet/adj/adj_nbr.c +++ b/src/vnet/adj/adj_nbr.c @@ -222,6 +222,27 @@ adj_nbr_alloc (fib_protocol_t nh_proto, return (adj); } +void +adj_nbr_set_mtu (adj_index_t adj_index, u16 mtu) +{ + ip_adjacency_t *adj; + + ASSERT(ADJ_INDEX_INVALID != adj_index); + + adj = adj_get(adj_index); + + if (0 == mtu) + vnet_rewrite_update_mtu(vnet_get_main(), adj->ia_link, + &adj->rewrite_header); + else + { + vnet_rewrite_update_mtu(vnet_get_main(), adj->ia_link, + &adj->rewrite_header); + adj->rewrite_header.max_l3_packet_bytes = + clib_min (adj->rewrite_header.max_l3_packet_bytes, mtu); + } +} + /* * adj_nbr_add_or_lock * @@ -268,13 +289,13 @@ adj_nbr_add_or_lock (fib_protocol_t nh_proto, * So ask the interface to do it. 
*/ vnet_update_adjacency_for_sw_interface(vnm, sw_if_index, adj_index); + adj_delegate_adj_created(adj_get(adj_index)); } else { adj_lock(adj_index); } - adj_delegate_adj_created(adj_get(adj_index)); return (adj_index); } @@ -1055,12 +1076,14 @@ const static dpo_vft_t adj_nbr_dpo_vft = { .dv_format = format_adj_nbr, .dv_mem_show = adj_mem_show, .dv_get_urpf = adj_dpo_get_urpf, + .dv_get_mtu = adj_dpo_get_mtu, }; const static dpo_vft_t adj_nbr_incompl_dpo_vft = { .dv_lock = adj_dpo_lock, .dv_unlock = adj_dpo_unlock, .dv_format = format_adj_nbr_incomplete, .dv_get_urpf = adj_dpo_get_urpf, + .dv_get_mtu = adj_dpo_get_mtu, }; /** diff --git a/src/vnet/adj/adj_nbr.h b/src/vnet/adj/adj_nbr.h index 3a89dc89a22..4874e73a45c 100644 --- a/src/vnet/adj/adj_nbr.h +++ b/src/vnet/adj/adj_nbr.h @@ -75,6 +75,13 @@ extern adj_index_t adj_nbr_add_or_lock_w_rewrite(fib_protocol_t nh_proto, const ip46_address_t *nh_addr, u32 sw_if_index, u8 *rewrite); + +/** + * Set the MTU on an adjacency + * + */ +extern void adj_nbr_set_mtu(adj_index_t ai, u16 mtu); + /** * @brief When adding a rewrite to an adjacency these are flags that * apply to that rewrite diff --git a/src/vnet/dpo/dpo.c b/src/vnet/dpo/dpo.c index 1331b5501bc..d8342ff17ae 100644 --- a/src/vnet/dpo/dpo.c +++ b/src/vnet/dpo/dpo.c @@ -23,6 +23,8 @@ * The VLIB graph nodes are graph of types, the DPO graph is a graph of instances. 
*/ +// clang-format off + #include <vnet/dpo/dpo.h> #include <vnet/ip/lookup.h> #include <vnet/ip/format.h> @@ -395,6 +397,18 @@ dpo_get_urpf(const dpo_id_t *dpo) return (~0); } +u16 +dpo_get_mtu(const dpo_id_t *dpo) +{ + if (dpo_id_is_valid(dpo) && + (NULL != dpo_vfts[dpo->dpoi_type].dv_get_mtu)) + { + return (dpo_vfts[dpo->dpoi_type].dv_get_mtu(dpo)); + } + + return (0xffff); +} + static u32 dpo_get_next_node (dpo_type_t child_type, dpo_proto_t child_proto, @@ -649,3 +663,5 @@ VLIB_CLI_COMMAND (show_fib_memory, static) = { .short_help = "show dpo memory", }; /* *INDENT-ON* */ + +// clang-format on diff --git a/src/vnet/dpo/dpo.h b/src/vnet/dpo/dpo.h index ee4990d0058..e9976c2dd87 100644 --- a/src/vnet/dpo/dpo.h +++ b/src/vnet/dpo/dpo.h @@ -24,6 +24,8 @@ * instances. */ +// clang-format off + #ifndef __DPO_H__ #define __DPO_H__ @@ -362,6 +364,16 @@ extern void dpo_stack_from_node(u32 child_node, extern u32 dpo_get_urpf(const dpo_id_t *dpo); /** + * Get the MTU DPO + * + * @param dpo + * The DPO from which to get the MTU + * + * @return MTU (0xffff if something more usefull was unavailable) + */ +extern u16 dpo_get_mtu(const dpo_id_t *dpo); + +/** * @brief A lock function registered for a DPO type */ typedef void (*dpo_lock_fn_t)(dpo_id_t *dpo); @@ -389,6 +401,11 @@ typedef u32* (*dpo_get_next_node_t)(const dpo_id_t *dpo); typedef u32 (*dpo_get_urpf_t)(const dpo_id_t *dpo); /** + * @brief Given a DPO instance return the MTU + */ +typedef u16 (*dpo_get_mtu_t)(const dpo_id_t *dpo); + +/** * @brief Called during FIB interposition when the originally * registered DPO is used to 'clone' an instance for interposition * at a particular location in the FIB graph. 
@@ -433,6 +450,10 @@ typedef struct dpo_vft_t_ */ dpo_get_urpf_t dv_get_urpf; /** + * Get MTU + */ + dpo_get_mtu_t dv_get_mtu; + /** * Signal on an interposed child that the parent has changed */ dpo_mk_interpose_t dv_mk_interpose; @@ -548,3 +569,5 @@ do { \ if ((YESNO)) vlib_worker_thread_barrier_release((VM)); #endif + +// clang-format on diff --git a/src/vnet/dpo/load_balance.c b/src/vnet/dpo/load_balance.c index fb876a09ec2..a212532dffd 100644 --- a/src/vnet/dpo/load_balance.c +++ b/src/vnet/dpo/load_balance.c @@ -25,6 +25,8 @@ #include <vnet/ip/ip4_inlines.h> #include <vnet/ip/ip6_inlines.h> +// clang-format off + /* * distribution error tolerance for load-balancing */ @@ -918,11 +920,30 @@ load_balance_mem_show (void) load_balance_map_show_mem(); } +static u16 +load_balance_dpo_get_mtu (const dpo_id_t *dpo) +{ + const dpo_id_t *buckets; + load_balance_t *lb; + u16 i, mtu = 0xffff; + + lb = load_balance_get(dpo->dpoi_index); + buckets = load_balance_get_buckets(lb); + + for (i = 0; i < lb->lb_n_buckets; i++) + { + mtu = clib_min (mtu, dpo_get_mtu (&buckets[i])); + } + + return (mtu); +} + const static dpo_vft_t lb_vft = { .dv_lock = load_balance_lock, .dv_unlock = load_balance_unlock, .dv_format = format_load_balance_dpo, .dv_mem_show = load_balance_mem_show, + .dv_get_mtu = load_balance_dpo_get_mtu, }; /** @@ -1323,3 +1344,5 @@ VLIB_REGISTER_NODE (bier_load_balance_node) = { .format_trace = format_bier_load_balance_trace, .sibling_of = "mpls-load-balance", }; + +// clang-format on diff --git a/src/vnet/dpo/mpls_label_dpo.c b/src/vnet/dpo/mpls_label_dpo.c index 683b5449513..b87cb1efcd2 100644 --- a/src/vnet/dpo/mpls_label_dpo.c +++ b/src/vnet/dpo/mpls_label_dpo.c @@ -18,6 +18,8 @@ #include <vnet/mpls/mpls.h> #include <vnet/dpo/drop_dpo.h> +// clang-format off + #ifndef CLIB_MARCH_VARIANT /* * pool of all MPLS Label DPOs @@ -1213,12 +1215,25 @@ mpls_label_interpose (const dpo_id_t *original, mpls_label_dpo_get_index(mld_clone)); } +static u16 
+mpls_label_dpo_get_mtu (const dpo_id_t *dpo) +{ + mpls_label_dpo_t *mld; + + mld = mpls_label_dpo_get(dpo->dpoi_index); + + /* return the parent's MTU minus the amount of header + * this DPO imposes */ + return (dpo_get_mtu (&mld->mld_dpo) - sizeof(mpls_label_t) * mld->mld_n_labels); +} + const static dpo_vft_t mld_vft = { .dv_lock = mpls_label_dpo_lock, .dv_unlock = mpls_label_dpo_unlock, .dv_format = format_mpls_label_dpo, .dv_mem_show = mpls_label_dpo_mem_show, .dv_mk_interpose = mpls_label_interpose, + .dv_get_mtu = mpls_label_dpo_get_mtu, }; const static char* const mpls_label_imp_pipe_ip4_nodes[] = @@ -1337,3 +1352,5 @@ mpls_label_dpo_get_type (mpls_label_dpo_flags_t flags) return (mpls_label_dpo_types[flags]); } #endif /* CLIB_MARCH_VARIANT */ + +// clang-format on diff --git a/src/vnet/fib/fib_entry.c b/src/vnet/fib/fib_entry.c index 6edf31b47f3..119a7ac5e77 100644 --- a/src/vnet/fib/fib_entry.c +++ b/src/vnet/fib/fib_entry.c @@ -1362,7 +1362,7 @@ fib_entry_cover_updated (fib_node_index_t fib_entry_index) if (0 == index) { /* - * only the best source gets to set the back walk flags + * only the best source gets to set the install result */ res = fib_entry_src_action_cover_update(fib_entry, esrc); bflags = fib_entry_src_get_flags(esrc); @@ -1370,7 +1370,23 @@ fib_entry_cover_updated (fib_node_index_t fib_entry_index) } else { - fib_entry_src_action_cover_update(fib_entry, esrc); + /* + * contirubting sources can set backwalk flags + */ + if (esrc->fes_flags & FIB_ENTRY_SRC_FLAG_CONTRIBUTING) + { + fib_entry_src_cover_res_t tmp = { + .install = !0, + .bw_reason = FIB_NODE_BW_REASON_FLAG_NONE, + }; + + tmp = fib_entry_src_action_cover_update(fib_entry, esrc); + res.bw_reason |= tmp.bw_reason; + } + else + { + fib_entry_src_action_cover_update(fib_entry, esrc); + } } index++; })); diff --git a/src/vnet/fib/fib_node.h b/src/vnet/fib/fib_node.h index 5cf9182560f..27e67b11c87 100644 --- a/src/vnet/fib/fib_node.h +++ b/src/vnet/fib/fib_node.h @@ -119,6 +119,10 @@ 
typedef enum fib_node_back_walk_reason_t_ { */ FIB_NODE_BW_REASON_ADJ_UPDATE, /** + * Walk update the adjacency MTU + */ + FIB_NODE_BW_REASON_ADJ_MTU, + /** * Walk to update children to inform them the adjacency is now down. */ FIB_NODE_BW_REASON_ADJ_DOWN, @@ -135,6 +139,7 @@ typedef enum fib_node_back_walk_reason_t_ { [FIB_NODE_BW_REASON_INTERFACE_DOWN] = "if-down", \ [FIB_NODE_BW_REASON_INTERFACE_DELETE] = "if-delete", \ [FIB_NODE_BW_REASON_ADJ_UPDATE] = "adj-update", \ + [FIB_NODE_BW_REASON_ADJ_MTU] = "adj-mtu", \ [FIB_NODE_BW_REASON_ADJ_DOWN] = "adj-down", \ } @@ -154,6 +159,7 @@ typedef enum fib_node_bw_reason_flag_t_ { FIB_NODE_BW_REASON_FLAG_INTERFACE_DOWN = (1 << FIB_NODE_BW_REASON_INTERFACE_DOWN), FIB_NODE_BW_REASON_FLAG_INTERFACE_DELETE = (1 << FIB_NODE_BW_REASON_INTERFACE_DELETE), FIB_NODE_BW_REASON_FLAG_ADJ_UPDATE = (1 << FIB_NODE_BW_REASON_ADJ_UPDATE), + FIB_NODE_BW_REASON_FLAG_ADJ_MTU = (1 << FIB_NODE_BW_REASON_ADJ_MTU), FIB_NODE_BW_REASON_FLAG_ADJ_DOWN = (1 << FIB_NODE_BW_REASON_ADJ_DOWN), } __attribute__ ((packed)) fib_node_bw_reason_flag_t; diff --git a/src/vnet/fib/fib_path.c b/src/vnet/fib/fib_path.c index f48b64484cb..01140d5d0dc 100644 --- a/src/vnet/fib/fib_path.c +++ b/src/vnet/fib/fib_path.c @@ -999,6 +999,7 @@ fib_path_back_walk_notify (fib_node_t *node, &path->fp_dpo); } if ((FIB_NODE_BW_REASON_FLAG_ADJ_UPDATE & ctx->fnbw_reason) || + (FIB_NODE_BW_REASON_FLAG_ADJ_MTU & ctx->fnbw_reason) || (FIB_NODE_BW_REASON_FLAG_ADJ_DOWN & ctx->fnbw_reason)) { /* diff --git a/src/vnet/gre/gre.c b/src/vnet/gre/gre.c index 0669c676bf5..fcdf9c0d6bc 100644 --- a/src/vnet/gre/gre.c +++ b/src/vnet/gre/gre.c @@ -495,8 +495,13 @@ mgre_update_adj (vnet_main_t * vnm, u32 sw_if_index, adj_index_t ai) adj->ia_nh_proto, &adj->sub_type.nbr.next_hop); if (NULL == ne) - // no NHRP entry to provide the next-hop - return; + { + // no TEIB entry to provide the next-hop + adj_nbr_midchain_update_rewrite ( + ai, gre_get_fixup (t->tunnel_dst.fp_proto, adj_get_link_type (ai)), 
+ uword_to_pointer (t->flags, void *), ADJ_FLAG_NONE, NULL); + return; + } mgre_walk_ctx_t ctx = { .t = t, diff --git a/src/vnet/ip/ip.api b/src/vnet/ip/ip.api index 3072e3e7c63..f49fc16bc1d 100644 --- a/src/vnet/ip/ip.api +++ b/src/vnet/ip/ip.api @@ -704,6 +704,63 @@ autoreply define ip_reassembly_enable_disable vl_api_ip_reass_type_t type; }; +/** + @brief Set a Path MTU value. i.e. a MTU value for a given neighbour. + The neighbour can be described as attached (w/ interface and next-hop) + or remote (w/ table_id and next-hop); + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param table_id - table-ID for next-hop + @param nh - Next hop + @param path_mtu - value to set, 0 is disable. +*/ +typedef ip_path_mtu +{ + u32 client_index; + u32 context; + u32 table_id; + vl_api_address_t nh; + u16 path_mtu; +}; +autoreply define ip_path_mtu_update +{ + u32 client_index; + u32 context; + vl_api_ip_path_mtu_t pmtu; +}; +define ip_path_mtu_get +{ + u32 client_index; + u32 context; + u32 cursor; +}; +define ip_path_mtu_get_reply +{ + u32 context; + i32 retval; + u32 cursor; +}; +define ip_path_mtu_details +{ + u32 context; + vl_api_ip_path_mtu_t pmtu; +}; +service { + rpc ip_path_mtu_get returns ip_path_mtu_get_reply + stream ip_path_mtu_details; +}; + +autoreply define ip_path_mtu_replace_begin +{ + u32 client_index; + u32 context; +}; +autoreply define ip_path_mtu_replace_end +{ + u32 client_index; + u32 context; +}; + /* * Local Variables: * eval: (c-set-style "gnu") diff --git a/src/vnet/ip/ip_api.c b/src/vnet/ip/ip_api.c index 3bf404baadf..5b87f7cc86a 100644 --- a/src/vnet/ip/ip_api.c +++ b/src/vnet/ip/ip_api.c @@ -28,6 +28,7 @@ #include <vnet/ip/ip_types_api.h> #include <vnet/ip/ip_punt_drop.h> #include <vnet/ip/ip_types_api.h> +#include <vnet/ip/ip_path_mtu.h> #include <vnet/fib/fib_table.h> #include <vnet/fib/fib_api.h> #include <vnet/ethernet/arp_packet.h> @@ -104,7 +105,11 @@ _ 
(IP_REASSEMBLY_SET, ip_reassembly_set) \ _ (IP_REASSEMBLY_GET, ip_reassembly_get) \ _ (IP_REASSEMBLY_ENABLE_DISABLE, ip_reassembly_enable_disable) \ - _ (IP_PUNT_REDIRECT_DUMP, ip_punt_redirect_dump) + _ (IP_PUNT_REDIRECT_DUMP, ip_punt_redirect_dump) \ + _ (IP_PATH_MTU_UPDATE, ip_path_mtu_update) \ + _ (IP_PATH_MTU_REPLACE_BEGIN, ip_path_mtu_replace_begin) \ + _ (IP_PATH_MTU_REPLACE_END, ip_path_mtu_replace_end) \ + _ (IP_PATH_MTU_GET, ip_path_mtu_get) static void vl_api_sw_interface_ip6_enable_disable_t_handler @@ -1134,18 +1139,18 @@ static void REPLY_MACRO (VL_API_IP_CONTAINER_PROXY_ADD_DEL_REPLY); } -typedef struct ip_container_proxy_walk_ctx_t_ +typedef struct ip_walk_ctx_t_ { vl_api_registration_t *reg; u32 context; -} ip_container_proxy_walk_ctx_t; +} ip_walk_ctx_t; static int ip_container_proxy_send_details (const fib_prefix_t * pfx, u32 sw_if_index, void *args) { vl_api_ip_container_proxy_details_t *mp; - ip_container_proxy_walk_ctx_t *ctx = args; + ip_walk_ctx_t *ctx = args; mp = vl_msg_api_alloc (sizeof (*mp)); if (!mp) @@ -1173,7 +1178,7 @@ vl_api_ip_container_proxy_dump_t_handler (vl_api_ip_container_proxy_dump_t * if (!reg) return; - ip_container_proxy_walk_ctx_t ctx = { + ip_walk_ctx_t ctx = { .context = mp->context, .reg = reg, }; @@ -1624,21 +1629,15 @@ void REPLY_MACRO (VL_API_IP_REASSEMBLY_ENABLE_DISABLE_REPLY); } -typedef struct ip_punt_redirect_walk_ctx_t_ -{ - vl_api_registration_t *reg; - u32 context; -} ip_punt_redirect_walk_ctx_t; - static walk_rc_t send_ip_punt_redirect_details (u32 rx_sw_if_index, const ip_punt_redirect_rx_t * ipr, void *arg) { - ip_punt_redirect_walk_ctx_t *ctx = arg; vl_api_ip_punt_redirect_details_t *mp; fib_path_encode_ctx_t path_ctx = { .rpaths = NULL, }; + ip_walk_ctx_t *ctx = arg; mp = vl_msg_api_alloc (sizeof (*mp)); if (!mp) @@ -1676,7 +1675,7 @@ vl_api_ip_punt_redirect_dump_t_handler (vl_api_ip_punt_redirect_dump_t * mp) if (mp->is_ipv6 == 1) fproto = FIB_PROTOCOL_IP6; - ip_punt_redirect_walk_ctx_t ctx = { + 
ip_walk_ctx_t ctx = { .reg = reg, .context = mp->context, }; @@ -1699,6 +1698,73 @@ vl_api_ip_punt_redirect_dump_t_handler (vl_api_ip_punt_redirect_dump_t * mp) ip_punt_redirect_walk (fproto, send_ip_punt_redirect_details, &ctx); } +void +vl_api_ip_path_mtu_update_t_handler (vl_api_ip_path_mtu_update_t *mp) +{ + vl_api_ip_path_mtu_update_reply_t *rmp; + ip_address_t nh; + int rv = 0; + + ip_address_decode2 (&mp->pmtu.nh, &nh); + + rv = ip_path_mtu_update (&nh, ntohl (mp->pmtu.table_id), + ntohs (mp->pmtu.path_mtu)); + + REPLY_MACRO (VL_API_IP_PATH_MTU_UPDATE_REPLY); +} + +void +vl_api_ip_path_mtu_replace_begin_t_handler ( + vl_api_ip_path_mtu_replace_begin_t *mp) +{ + vl_api_ip_path_mtu_replace_begin_reply_t *rmp; + int rv; + + rv = ip_path_mtu_replace_begin (); + + REPLY_MACRO (VL_API_IP_PATH_MTU_REPLACE_BEGIN_REPLY); +} + +void +vl_api_ip_path_mtu_replace_end_t_handler (vl_api_ip_path_mtu_replace_end_t *mp) +{ + vl_api_ip_path_mtu_replace_end_reply_t *rmp; + int rv; + + rv = ip_path_mtu_replace_end (); + + REPLY_MACRO (VL_API_IP_PATH_MTU_REPLACE_END_REPLY); +} + +static void +send_ip_path_mtu_details (index_t ipti, vl_api_registration_t *rp, u32 context) +{ + vl_api_ip_path_mtu_details_t *rmp; + ip_address_t ip; + ip_pmtu_t *ipt; + + ipt = ip_path_mtu_get (ipti); + + REPLY_MACRO_DETAILS4 (VL_API_IP_PATH_MTU_DETAILS, rp, context, ({ + ip_pmtu_get_ip (ipt, &ip); + ip_address_encode2 (&ip, &rmp->pmtu.nh); + rmp->pmtu.table_id = + htonl (ip_pmtu_get_table_id (ipt)); + rmp->pmtu.path_mtu = htons (ipt->ipt_cfg_pmtu); + })); +} + +static void +vl_api_ip_path_mtu_get_t_handler (vl_api_ip_path_mtu_get_t *mp) +{ + vl_api_ip_path_mtu_get_reply_t *rmp; + i32 rv = 0; + + REPLY_AND_DETAILS_MACRO ( + VL_API_IP_PATH_MTU_GET_REPLY, ip_pmtu_pool, + ({ send_ip_path_mtu_details (cursor, rp, mp->context); })); +} + #define vl_msg_name_crc_list #include <vnet/ip/ip.api.h> #undef vl_msg_name_crc_list diff --git a/src/vnet/ip/ip_path_mtu.c b/src/vnet/ip/ip_path_mtu.c new file mode 
100644 index 00000000000..38adb44065b --- /dev/null +++ b/src/vnet/ip/ip_path_mtu.c @@ -0,0 +1,883 @@ +/* + *------------------------------------------------------------------ + * ip_path_mtu.c + * + * Copyright (c) 2021 Graphiant. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#include <vnet/ip/ip_path_mtu.h> +#include <vnet/ip/ip_frag.h> +#include <vnet/adj/adj_delegate.h> +#include <vnet/adj/adj_nbr.h> +#include <vnet/fib/fib_table.h> +#include <vnet/fib/fib_entry_track.h> + +#include <vnet/dpo/drop_dpo.h> + +/** + * Path MTU + * + * A path is a peer. A peer is known by an IP address (in a table). + * Insert a DPO in the forwarding chain for the peer to perform the + * fragmentation. + * For attached peers, all traffic will use the peer's adjacency, there + * is already an MTU chekc in the adjacency (for the link's MTU) so as an + * optimisation, instead of using a DPO, we add a delegate to the adjacency + * to set the adjacency's MTU to the path MTU. 
+ */ + +/** + * the logger + */ +static vlib_log_class_t ip_pmtu_logger; + +static adj_delegate_type_t ip_pmtu_adj_delegate_type; +static fib_source_t ip_pmtu_source; + +/** + * DPO pool + */ +ip_pmtu_dpo_t *ip_pmtu_dpo_pool; + +/** + * DPO type registered for these GBP FWD + */ +static dpo_type_t ip_pmtu_dpo_type; + +/** + * Fib node type for the tracker + */ +static fib_node_type_t ip_pmtu_fib_type; + +/** + * Path MTU tracker pool + */ +ip_pmtu_t *ip_pmtu_pool; + +/** + * Delegate added to adjacencies to track path MTU + */ +typedef struct ip_path_mtu_adj_delegate_t_ +{ + u16 pmtu; +} ip_path_mtu_adj_delegate_t; + +static ip_path_mtu_adj_delegate_t *ip_path_mtu_adj_delegate_pool; + +/* DB of all FIB PMTU settings */ +typedef struct ip_pmtu_key_t_ +{ + ip46_address_t nh; + u32 table_id; + fib_protocol_t fproto; +} __clib_packed ip_pmtu_key_t; + +static uword *ip_pmtu_db; + +#define IP_PMTU_TRKR_DBG(_ipt, _fmt, _args...) \ + { \ + vlib_log_debug (ip_pmtu_logger, "[%U]: " _fmt ": ", format_ip_pmtu, \ + _ipt - ip_pmtu_pool, ##_args); \ + } +#define IP_PMTU_DBG(_fmt, _args...) 
\ + { \ + vlib_log_debug (ip_pmtu_logger, _fmt ": ", ##_args); \ + } + +static u8 * +format_ip_pmtu_flags (u8 *s, va_list *ap) +{ + ip_pmtu_flags_t f = va_arg (*ap, ip_pmtu_flags_t); + + if (0) + ; +#define _(a, b, c) else if (f & IP_PMTU_FLAG_##a) s = format (s, "%s ", c); + foreach_ip_pmtu_flag +#undef _ + + return (s); +} + +u32 +ip_pmtu_get_table_id (const ip_pmtu_t *ipt) +{ + const fib_prefix_t *pfx; + u32 fib_index; + + pfx = fib_entry_get_prefix (ipt->ipt_fib_entry); + fib_index = fib_entry_get_fib_index (ipt->ipt_fib_entry); + + return (fib_table_get_table_id (fib_index, pfx->fp_proto)); +} + +void +ip_pmtu_get_ip (const ip_pmtu_t *ipt, ip_address_t *ip) +{ + const fib_prefix_t *pfx; + + pfx = fib_entry_get_prefix (ipt->ipt_fib_entry); + ip_address_from_46 (&pfx->fp_addr, pfx->fp_proto, ip); +} + +static u8 * +format_ip_pmtu (u8 *s, va_list *ap) +{ + ip_pmtu_t *ipt; + index_t ipti = va_arg (*ap, index_t); + const fib_prefix_t *pfx; + u32 fib_index; + + ipt = pool_elt_at_index (ip_pmtu_pool, ipti); + pfx = fib_entry_get_prefix (ipt->ipt_fib_entry); + fib_index = fib_entry_get_fib_index (ipt->ipt_fib_entry); + + s = + format (s, "[%d] [tbl:[%d:%d]] %U pmtu:[cfg:%d, oper:%d, parent:%d] [%U]", + ipti, ip_pmtu_get_table_id (ipt), fib_index, format_fib_prefix, + pfx, ipt->ipt_cfg_pmtu, ipt->ipt_oper_pmtu, ipt->ipt_parent_pmtu, + format_ip_pmtu_flags, ipt->ipt_flags); + + return (s); +} + +static u8 * +format_ip_path_mtu_adj_delegate (const adj_delegate_t *aed, u8 *s) +{ + ip_path_mtu_adj_delegate_t *ip_adj; + + ip_adj = pool_elt_at_index (ip_path_mtu_adj_delegate_pool, aed->ad_index); + + s = format (s, "IP path-MTU: %d", ip_adj->pmtu); + + return (s); +} + +static void +ip_pmtu_adj_delegate_adj_created (adj_index_t ai) +{ + ip_path_mtu_adj_delegate_t *ipp_ad; + const ip_pmtu_t *ipt; + ip_adjacency_t *adj; + u32 table_id; + uword *p; + + adj = adj_get (ai); + + switch (adj->lookup_next_index) + { + case IP_LOOKUP_NEXT_DROP: + case IP_LOOKUP_NEXT_PUNT: + case 
IP_LOOKUP_NEXT_LOCAL: + case IP_LOOKUP_NEXT_GLEAN: + case IP_LOOKUP_NEXT_MCAST: + case IP_LOOKUP_NEXT_BCAST: + case IP_LOOKUP_NEXT_MCAST_MIDCHAIN: + case IP_LOOKUP_NEXT_ICMP_ERROR: + case IP_LOOKUP_N_NEXT: + return; + + case IP_LOOKUP_NEXT_ARP: + case IP_LOOKUP_NEXT_REWRITE: + case IP_LOOKUP_NEXT_MIDCHAIN: + break; + } + + table_id = fib_table_get_table_id_for_sw_if_index ( + adj->ia_nh_proto, adj->rewrite_header.sw_if_index); + + ip_pmtu_key_t key = { + .nh = adj->sub_type.nbr.next_hop, + .table_id = table_id, + .fproto = adj->ia_nh_proto, + }; + + p = hash_get_mem (ip_pmtu_db, &key); + + if (NULL == p) + return; + + ipt = pool_elt_at_index (ip_pmtu_pool, p[0]); + + pool_get (ip_path_mtu_adj_delegate_pool, ipp_ad); + ipp_ad->pmtu = ipt->ipt_cfg_pmtu; + + adj_delegate_add (adj, ip_pmtu_adj_delegate_type, + ipp_ad - ip_path_mtu_adj_delegate_pool); + + adj_nbr_set_mtu (ai, ipp_ad->pmtu); + + IP_PMTU_TRKR_DBG (ipt, "adj-added:", ai); +} + +static void +ip_pmtu_adj_delegate_adj_deleted (adj_delegate_t *ad) +{ + pool_put_index (ip_path_mtu_adj_delegate_pool, ad->ad_index); +} + +static void +ip_pmtu_adj_delegate_adj_modified (adj_delegate_t *ad) +{ + ip_path_mtu_adj_delegate_t *ipp_ad; + + ipp_ad = pool_elt_at_index (ip_path_mtu_adj_delegate_pool, ad->ad_index); + + adj_nbr_set_mtu (ad->ad_adj_index, ipp_ad->pmtu); +} + +const adj_delegate_vft_t ip_path_adj_delegate_vft = { + .adv_format = format_ip_path_mtu_adj_delegate, + .adv_adj_deleted = ip_pmtu_adj_delegate_adj_deleted, + .adv_adj_modified = ip_pmtu_adj_delegate_adj_modified, + .adv_adj_created = ip_pmtu_adj_delegate_adj_created, +}; + +static bool +ip_path_mtu_value_invalid (u16 pmtu) +{ + return (pmtu == 0 || pmtu == 0xffff); +} + +static adj_walk_rc_t +ip_ptmu_adj_walk_remove (adj_index_t ai, void *ctx) +{ + adj_delegate_t *ad; + + ad = adj_delegate_get (adj_get (ai), ip_pmtu_adj_delegate_type); + + if (ad) + { + adj_nbr_set_mtu (ai, 0); + + pool_put_index (ip_path_mtu_adj_delegate_pool, ad->ad_index); + 
adj_delegate_remove (ai, ip_pmtu_adj_delegate_type); + } + return (ADJ_WALK_RC_CONTINUE); +} + +static adj_walk_rc_t +ip_ptmu_adj_walk_update (adj_index_t ai, void *ctx) +{ + ip_path_mtu_adj_delegate_t *ipp_ad; + adj_delegate_t *ad; + u16 *pmtup; + + pmtup = ctx; + ad = adj_delegate_get (adj_get (ai), ip_pmtu_adj_delegate_type); + + if (ad) + ipp_ad = pool_elt_at_index (ip_path_mtu_adj_delegate_pool, ad->ad_index); + else + { + pool_get (ip_path_mtu_adj_delegate_pool, ipp_ad); + + adj_delegate_add (adj_get (ai), ip_pmtu_adj_delegate_type, + ipp_ad - ip_path_mtu_adj_delegate_pool); + } + + ipp_ad->pmtu = *pmtup; + + adj_nbr_set_mtu (ai, ipp_ad->pmtu); + + return (ADJ_WALK_RC_CONTINUE); +} + +static ip_pmtu_dpo_t * +ip_pmtu_dpo_alloc (void) +{ + ip_pmtu_dpo_t *ipm; + + pool_get_aligned_zero (ip_pmtu_dpo_pool, ipm, sizeof (ip_pmtu_dpo_t)); + + return (ipm); +} + +static ip_pmtu_dpo_t * +ip_pmtu_dpo_get_from_dpo (const dpo_id_t *dpo) +{ + ASSERT (ip_pmtu_dpo_type == dpo->dpoi_type); + + return (ip_pmtu_dpo_get (dpo->dpoi_index)); +} + +static index_t +ip_pmtu_dpo_get_index (ip_pmtu_dpo_t *ipm) +{ + return (ipm - ip_pmtu_dpo_pool); +} + +static void +ip_pmtu_dpo_lock (dpo_id_t *dpo) +{ + ip_pmtu_dpo_t *ipm; + + ipm = ip_pmtu_dpo_get_from_dpo (dpo); + ipm->ipm_locks++; +} + +static void +ip_pmtu_dpo_unlock (dpo_id_t *dpo) +{ + ip_pmtu_dpo_t *ipm; + + ipm = ip_pmtu_dpo_get_from_dpo (dpo); + ipm->ipm_locks--; + + if (0 == ipm->ipm_locks) + { + dpo_reset (&ipm->ipm_dpo); + pool_put (ip_pmtu_dpo_pool, ipm); + } +} + +static u32 +ip_pmtu_dpo_get_urpf (const dpo_id_t *dpo) +{ + ip_pmtu_dpo_t *ipm; + + ipm = ip_pmtu_dpo_get_from_dpo (dpo); + + return (dpo_get_urpf (&ipm->ipm_dpo)); +} + +void +ip_pmtu_dpo_add_or_lock (fib_protocol_t fproto, u16 pmtu, dpo_id_t *dpo) +{ + ip_pmtu_dpo_t *ipm; + dpo_id_t parent = DPO_INVALID; + + ipm = ip_pmtu_dpo_alloc (); + + ipm->ipm_proto = fib_proto_to_dpo (fproto); + ipm->ipm_pmtu = pmtu; + + dpo_copy (&parent, drop_dpo_get (ipm->ipm_proto)); 
+ dpo_stack (ip_pmtu_dpo_type, ipm->ipm_proto, &ipm->ipm_dpo, &parent); + dpo_set (dpo, ip_pmtu_dpo_type, ipm->ipm_proto, ip_pmtu_dpo_get_index (ipm)); +} + +u8 * +format_ip_pmtu_dpo (u8 *s, va_list *ap) +{ + index_t index = va_arg (*ap, index_t); + u32 indent = va_arg (*ap, u32); + ip_pmtu_dpo_t *ipm = ip_pmtu_dpo_get (index); + + s = format (s, "ip-pmtu-dpo: %U, mtu:%d", format_dpo_proto, ipm->ipm_proto, + ipm->ipm_pmtu); + s = format (s, "\n%U", format_white_space, indent + 2); + s = format (s, "%U", format_dpo_id, &ipm->ipm_dpo, indent + 4); + + return (s); +} + +/** + * Interpose a path MTU DPO + */ +static void +ip_pmtu_dpo_interpose (const dpo_id_t *original, const dpo_id_t *parent, + dpo_id_t *clone) +{ + ip_pmtu_dpo_t *ipm, *ipm_clone; + + ipm_clone = ip_pmtu_dpo_alloc (); + ipm = ip_pmtu_dpo_get (original->dpoi_index); + + ipm_clone->ipm_proto = ipm->ipm_proto; + ipm_clone->ipm_pmtu = ipm->ipm_pmtu; + + dpo_stack (ip_pmtu_dpo_type, ipm_clone->ipm_proto, &ipm_clone->ipm_dpo, + parent); + dpo_set (clone, ip_pmtu_dpo_type, ipm_clone->ipm_proto, + ip_pmtu_dpo_get_index (ipm_clone)); +} + +static u16 +ip_pmtu_dpo_get_mtu (const dpo_id_t *dpo) +{ + ip_pmtu_dpo_t *ipd; + + ipd = pool_elt_at_index (ip_pmtu_dpo_pool, dpo->dpoi_index); + + return (ipd->ipm_pmtu); +} + +const static dpo_vft_t ip_pmtu_dpo_vft = { + .dv_lock = ip_pmtu_dpo_lock, + .dv_unlock = ip_pmtu_dpo_unlock, + .dv_format = format_ip_pmtu_dpo, + .dv_get_urpf = ip_pmtu_dpo_get_urpf, + .dv_mk_interpose = ip_pmtu_dpo_interpose, + .dv_get_mtu = ip_pmtu_dpo_get_mtu, +}; + +/** + * @brief The per-protocol VLIB graph nodes that are assigned to a glean + * object. + * + * this means that these graph nodes are ones from which a glean is the + * parent object in the DPO-graph. 
+ */ +const static char *const ip_pmtu_dpo_ip4_nodes[] = { + "ip4-pmtu-dpo", + NULL, +}; + +const static char *const ip_pmtu_dpo_ip6_nodes[] = { + "ip6-pmtu-dpo", + NULL, +}; + +const static char *const *const ip_pmtu_dpo_nodes[DPO_PROTO_NUM] = { + [DPO_PROTO_IP4] = ip_pmtu_dpo_ip4_nodes, + [DPO_PROTO_IP6] = ip_pmtu_dpo_ip6_nodes, +}; + +static bool +ip_mtu_fib_entry_is_attached (fib_node_index_t fib_entry) +{ + const fib_prefix_t *pfx; + u32 cover, fib_index; + + fib_index = fib_entry_get_fib_index (fib_entry); + pfx = fib_entry_get_prefix (fib_entry); + + /* + * If the tracked prefix's cover is attached, then all packets that + * are forwarded to this neighbour will use the adjacency, this is a + * more efficient place to perform the MTU check and fragging + */ + cover = fib_table_get_less_specific (fib_index, pfx); + + return (FIB_ENTRY_FLAG_ATTACHED & fib_entry_get_flags (cover) || + FIB_ENTRY_FLAG_ATTACHED & fib_entry_get_flags (fib_entry)); +} + +static index_t +ip_pmtu_alloc (u32 fib_index, const fib_prefix_t *pfx, + const ip_pmtu_key_t *key, u16 pmtu) +{ + dpo_id_t ip_dpo = DPO_INVALID; + ip_pmtu_t *ipt; + fib_node_index_t cover; + const dpo_id_t *lb_dpo; + index_t ipti; + + pool_get (ip_pmtu_pool, ipt); + fib_node_init (&(ipt->ipt_node), ip_pmtu_fib_type); + + ipti = ipt - ip_pmtu_pool; + hash_set_mem_alloc (&ip_pmtu_db, key, ipti); + + ipt->ipt_cfg_pmtu = pmtu; + ipt->ipt_fib_entry = fib_entry_track (fib_index, pfx, ip_pmtu_fib_type, ipti, + &ipt->ipt_sibling); + + /* + * If the tracked prefix's cover is attached, then all packets that + * are forwarded to this neighbour will use the adjacency, this is a + * more efficient place to perform the MTU check and fragging + */ + cover = fib_table_get_less_specific (fib_index, pfx); + + if (ip_mtu_fib_entry_is_attached (ipt->ipt_fib_entry)) + { + u32 sw_if_index; + + ipt->ipt_flags |= IP_PMTU_FLAG_ATTACHED; + ipt->ipt_oper_pmtu = ipt->ipt_cfg_pmtu; + + sw_if_index = fib_entry_get_resolving_interface (cover); + + 
/* walk all adjs to add/update delegate */ + adj_nbr_walk_nh (sw_if_index, pfx->fp_proto, &pfx->fp_addr, + ip_ptmu_adj_walk_update, &ipt->ipt_oper_pmtu); + } + else + { + ipt->ipt_flags |= IP_PMTU_FLAG_REMOTE; + + lb_dpo = fib_entry_contribute_ip_forwarding (ipt->ipt_fib_entry); + + ipt->ipt_oper_pmtu = clib_min (dpo_get_mtu (lb_dpo), ipt->ipt_cfg_pmtu); + + /* + * interpose a policy DPO from the nh so that MTU is applied + */ + ip_pmtu_dpo_add_or_lock (pfx->fp_proto, ipt->ipt_oper_pmtu, &ip_dpo); + + fib_table_entry_special_dpo_add (fib_index, pfx, ip_pmtu_source, + FIB_ENTRY_FLAG_INTERPOSE, &ip_dpo); + dpo_reset (&ip_dpo); + } + + IP_PMTU_TRKR_DBG (ipt, "create"); + + return (ipti); +} + +static void +ip_pmtu_stack (ip_pmtu_t *ipt) +{ + bool was_attached, is_attached; + const fib_prefix_t *pfx; + u32 fib_index; + + pfx = fib_entry_get_prefix (ipt->ipt_fib_entry); + fib_index = fib_entry_get_fib_index (ipt->ipt_fib_entry); + + was_attached = !!(ipt->ipt_flags & IP_PMTU_FLAG_ATTACHED); + is_attached = ip_mtu_fib_entry_is_attached (ipt->ipt_fib_entry); + + if (was_attached && !is_attached) + { + /* transition from attached to remote - walk all adjs to remove delegate + */ + adj_nbr_walk_nh (fib_entry_get_resolving_interface (ipt->ipt_fib_entry), + pfx->fp_proto, &pfx->fp_addr, ip_ptmu_adj_walk_remove, + &ipt->ipt_oper_pmtu); + ipt->ipt_flags &= ~IP_PMTU_FLAG_ATTACHED; + } + if (!was_attached && is_attached) + { + /* transition from remote to attached - remove the DPO */ + fib_table_entry_special_remove (fib_index, pfx, ip_pmtu_source); + ipt->ipt_flags &= ~IP_PMTU_FLAG_REMOTE; + } + + if (is_attached) + { + /* walk all adjs to add/update delegate */ + ipt->ipt_oper_pmtu = ipt->ipt_cfg_pmtu; + adj_nbr_walk_nh (fib_entry_get_resolving_interface (ipt->ipt_fib_entry), + pfx->fp_proto, &pfx->fp_addr, ip_ptmu_adj_walk_update, + &ipt->ipt_oper_pmtu); + ipt->ipt_flags |= IP_PMTU_FLAG_ATTACHED; + } + else + { + const dpo_id_t *lb_dpo; + u16 dpo_mtu; + + 
fib_table_entry_special_remove (fib_index, pfx, ip_pmtu_source); + + ipt->ipt_flags |= IP_PMTU_FLAG_REMOTE; + lb_dpo = fib_entry_contribute_ip_forwarding (ipt->ipt_fib_entry); + dpo_mtu = dpo_get_mtu (lb_dpo); + + ipt->ipt_oper_pmtu = clib_min (dpo_mtu, ipt->ipt_cfg_pmtu); + + /* + * if the configured path-MTU is less that the egress/interface then + * interpose a policy DPO from the nh so that MTU is applied + */ + if (ipt->ipt_oper_pmtu < dpo_mtu) + { + dpo_id_t ip_dpo = DPO_INVALID; + + ip_pmtu_dpo_add_or_lock (pfx->fp_proto, ipt->ipt_oper_pmtu, &ip_dpo); + + fib_table_entry_special_dpo_update ( + fib_index, pfx, ip_pmtu_source, FIB_ENTRY_FLAG_INTERPOSE, &ip_dpo); + dpo_reset (&ip_dpo); + } + } + IP_PMTU_TRKR_DBG (ipt, "stack"); +} + +static void +ip_pmtu_update (index_t ipti, u16 pmtu) +{ + ip_pmtu_t *ipt; + + ipt = pool_elt_at_index (ip_pmtu_pool, ipti); + ipt->ipt_flags &= ~IP_PMTU_FLAG_STALE; + ipt->ipt_cfg_pmtu = pmtu; + + ip_pmtu_stack (ipt); +} + +static index_t +ip_pmtu_destroy (index_t ipti, const ip_pmtu_key_t *key) +{ + ip_pmtu_t *ipt; + const fib_prefix_t *pfx; + + ipt = pool_elt_at_index (ip_pmtu_pool, ipti); + pfx = fib_entry_get_prefix (ipt->ipt_fib_entry); + + IP_PMTU_TRKR_DBG (ipt, "destroy"); + + if (ipt->ipt_flags & IP_PMTU_FLAG_REMOTE) + fib_table_entry_special_remove ( + fib_entry_get_fib_index (ipt->ipt_fib_entry), pfx, ip_pmtu_source); + else + /* remove the delegate from all the adjacencies */ + adj_nbr_walk_nh (fib_entry_get_resolving_interface (ipt->ipt_fib_entry), + pfx->fp_proto, &pfx->fp_addr, ip_ptmu_adj_walk_remove, + NULL); + + /* + * Drop the fib entry we're tracking + */ + fib_entry_untrack (ipt->ipt_fib_entry, ipt->ipt_sibling); + + /* + * remove from DB and return to pool + */ + hash_unset_mem_free (&ip_pmtu_db, key); + pool_put (ip_pmtu_pool, ipt); + + return (ipti); +} + +int +ip_path_mtu_update (const ip_address_t *nh, u32 table_id, u16 pmtu) +{ + fib_prefix_t pfx; + u32 fib_index; + uword *p; + + ip_address_to_fib_prefix 
(nh, &pfx); + fib_index = fib_table_find (pfx.fp_proto, table_id); + + if (~0 == fib_index) + return (VNET_API_ERROR_NO_SUCH_TABLE); + + ip_pmtu_key_t key = { + .fproto = pfx.fp_proto, + .table_id = table_id, + .nh = pfx.fp_addr, + }; + + p = hash_get_mem (ip_pmtu_db, &key); + + if (!ip_path_mtu_value_invalid (pmtu)) + { + /* Add or update of path MTU */ + if (NULL == p) + ip_pmtu_alloc (fib_index, &pfx, &key, pmtu); + else + ip_pmtu_update (p[0], pmtu); + } + else + { + if (NULL != p) + ip_pmtu_destroy (p[0], &key); + } + + return (0); +} + +static walk_rc_t +ip_path_mtu_walk_mark (index_t ipti, void *ctx) +{ + ip_pmtu_t *ipt; + + ipt = ip_path_mtu_get (ipti); + + ipt->ipt_flags |= IP_PMTU_FLAG_STALE; + + return (WALK_CONTINUE); +} + +typedef struct ip_path_mtu_walk_sweep_ctx_t_ +{ + index_t *indicies; +} ip_path_mtu_walk_sweep_ctx_t; + +static walk_rc_t +ip_path_mtu_walk_sweep (index_t ipti, void *arg) +{ + ip_path_mtu_walk_sweep_ctx_t *ctx = arg; + ip_pmtu_t *ipt; + + ipt = ip_path_mtu_get (ipti); + + if (ipt->ipt_flags & IP_PMTU_FLAG_STALE) + { + vec_add1 (ctx->indicies, ipti); + } + + return (WALK_CONTINUE); +} + +int +ip_path_mtu_replace_begin (void) +{ + IP_PMTU_DBG ("replace-begin"); + + ip_path_mtu_walk (ip_path_mtu_walk_mark, NULL); + + return (0); +} + +int +ip_path_mtu_replace_end (void) +{ + index_t *ipti; + + IP_PMTU_DBG ("replace-end"); + + /* + * not safe to walk the pool whilst deleting, so create + * temporary storage of stale entries + */ + ip_path_mtu_walk_sweep_ctx_t ctx = { + .indicies = NULL, + }; + + ip_path_mtu_walk (ip_path_mtu_walk_sweep, &ctx); + + vec_foreach (ipti, ctx.indicies) + { + ip_pmtu_t *ipt; + ip_address_t ip; + + ipt = ip_path_mtu_get (*ipti); + ip_pmtu_get_ip (ipt, &ip); + ip_path_mtu_update (&ip, ip_pmtu_get_table_id (ipt), 0); + } + + vec_free (ctx.indicies); + + return (0); +} + +void +ip_path_mtu_walk (ip_path_mtu_walk_t fn, void *ctx) +{ + index_t ipmi; + + pool_foreach_index (ipmi, ip_pmtu_pool) + { + if (WALK_STOP == 
fn (ipmi, ctx)) + break; + } +} + +static fib_node_t * +ip_pmtu_get_node (fib_node_index_t index) +{ + ip_pmtu_t *ipt; + + ipt = pool_elt_at_index (ip_pmtu_pool, index); + + return (&(ipt->ipt_node)); +} + +static ip_pmtu_t * +ip_pmtu_get_from_node (fib_node_t *node) +{ + return ( + (ip_pmtu_t *) (((char *) node) - STRUCT_OFFSET_OF (ip_pmtu_t, ipt_node))); +} + +static void +ip_pmtu_last_lock_gone (fib_node_t *node) +{ + /* + * the lifetime of the entry is managed by the API. + */ + ASSERT (0); +} + +/* + * A back walk has reached this BIER entry + */ +static fib_node_back_walk_rc_t +ip_pmtu_back_walk_notify (fib_node_t *node, fib_node_back_walk_ctx_t *ctx) +{ + /* + * re-populate the ECMP tables with new choices + */ + ip_pmtu_t *ipr = ip_pmtu_get_from_node (node); + + ip_pmtu_stack (ipr); + + /* + * no need to propagate further up the graph, since there's nothing there + */ + return (FIB_NODE_BACK_WALK_CONTINUE); +} + +static const fib_node_vft_t ip_ptmu_fib_node_vft = { + .fnv_get = ip_pmtu_get_node, + .fnv_last_lock = ip_pmtu_last_lock_gone, + .fnv_back_walk = ip_pmtu_back_walk_notify, +}; + +static clib_error_t * +ip_path_module_init (vlib_main_t *vm) +{ + ip_pmtu_adj_delegate_type = + adj_delegate_register_new_type (&ip_path_adj_delegate_vft); + ip_pmtu_source = fib_source_allocate ("path-mtu", FIB_SOURCE_PRIORITY_HI, + FIB_SOURCE_BH_SIMPLE); + ip_pmtu_fib_type = fib_node_register_new_type (&ip_ptmu_fib_node_vft); + + ip_pmtu_db = hash_create_mem (0, sizeof (ip_pmtu_key_t), sizeof (index_t)); + ip_pmtu_logger = vlib_log_register_class ("ip", "pmtu"); + ip_pmtu_dpo_type = + dpo_register_new_type (&ip_pmtu_dpo_vft, ip_pmtu_dpo_nodes); + + return (NULL); +} + +VLIB_INIT_FUNCTION (ip_path_module_init); + +static clib_error_t * +show_ip_pmtu_command (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + index_t ipti; + + if (unformat (input, "%d", &ipti)) + { + /* + * show one in detail + */ + if (!pool_is_free_index (ip_pmtu_pool, ipti)) + 
vlib_cli_output (vm, "%U", format_ip_pmtu, ipti); + else + vlib_cli_output (vm, "entry %d invalid", ipti); + } + else + { + /* + * show all + */ + pool_foreach_index (ipti, ip_pmtu_pool) + { + vlib_cli_output (vm, "%U", format_ip_pmtu, ipti); + } + } + + return (NULL); +} + +VLIB_CLI_COMMAND (show_fib_entry, static) = { + .path = "show ip pmtu", + .function = show_ip_pmtu_command, + .short_help = "show ip path MTU", +}; + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/ip/ip_path_mtu.h b/src/vnet/ip/ip_path_mtu.h new file mode 100644 index 00000000000..2c54fcd7401 --- /dev/null +++ b/src/vnet/ip/ip_path_mtu.h @@ -0,0 +1,126 @@ +/* + *------------------------------------------------------------------ + * ip_path_mtu.h + * + * Copyright (c) 2021 Graphiant. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#include <vnet/ip/ip.h> + +/** + * @brief + * The Path MTU DPO. interposed in the forwarding chain of the host prefix. + */ +typedef struct ip_pmtu_dpo_t_ +{ + /** + * The protocol of packets using this DPO + */ + dpo_proto_t ipm_proto; + + u8 __pad8; + + /** + * Configured Path Mtu + */ + u16 ipm_pmtu; + + /** + * number of locks. 
+ */ + u16 ipm_locks; + + /** + * Stacked DPO + */ + dpo_id_t ipm_dpo; +} ip_pmtu_dpo_t; + +/* + * PMTU DPOs are accessed in the data-path so they should not straddle a cache + * line. Align to a integer factor of a cacheline + */ +STATIC_ASSERT_SIZEOF (ip_pmtu_dpo_t, 2 * sizeof (u64)); + +#define foreach_ip_pmtu_flag \ + _ (ATTACHED, 0, "attached") \ + _ (REMOTE, 1, "remote") \ + _ (STALE, 2, "stale") + +typedef enum ip_pmtu_flags_t_ +{ +#define _(a, b, c) IP_PMTU_FLAG_##a = (1 << b), + foreach_ip_pmtu_flag +#undef _ +} ip_pmtu_flags_t; + +/** + * Remote Path MTU tracking object + */ +typedef struct ip_pmtu_t_ +{ + /** linkage into the FIB graph */ + fib_node_t ipt_node; + + /** Track fib entry */ + fib_node_index_t ipt_fib_entry; + u32 ipt_sibling; + ip_pmtu_flags_t ipt_flags; + + /** Configured MTU */ + u16 ipt_cfg_pmtu; + + /** MTU from the parent MTU */ + u16 ipt_parent_pmtu; + + /** operational MTU; the minimum value of the cfg and parent MTU */ + u16 ipt_oper_pmtu; +} ip_pmtu_t; + +extern int ip_path_mtu_update (const ip_address_t *nh, u32 table_id, u16 pmtu); + +typedef walk_rc_t (*ip_path_mtu_walk_t) (index_t ipti, void *ctx); + +extern void ip_path_mtu_walk (ip_path_mtu_walk_t fn, void *ctx); +extern int ip_path_mtu_replace_begin (void); +extern int ip_path_mtu_replace_end (void); + +extern u32 ip_pmtu_get_table_id (const ip_pmtu_t *ipt); +extern void ip_pmtu_get_ip (const ip_pmtu_t *ipt, ip_address_t *ip); + +/** + * Data-plane accessor functions + */ +extern ip_pmtu_dpo_t *ip_pmtu_dpo_pool; +static_always_inline ip_pmtu_dpo_t * +ip_pmtu_dpo_get (index_t index) +{ + return (pool_elt_at_index (ip_pmtu_dpo_pool, index)); +} + +extern ip_pmtu_t *ip_pmtu_pool; +static_always_inline ip_pmtu_t * +ip_path_mtu_get (index_t index) +{ + return (pool_elt_at_index (ip_pmtu_pool, index)); +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/ip/ip_path_mtu_node.c 
b/src/vnet/ip/ip_path_mtu_node.c new file mode 100644 index 00000000000..b13f9de849c --- /dev/null +++ b/src/vnet/ip/ip_path_mtu_node.c @@ -0,0 +1,206 @@ +/* + *------------------------------------------------------------------ + * ip_path_mtu.c + * + * Copyright (c) 2020 Graphiant. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#include <vnet/ip/ip_path_mtu.h> +#include <vnet/ip/ip_frag.h> + +typedef enum +{ + IP_PMTU_DROP, + IP_PMTU_N_NEXT, +} ip_pmtu_next_t; + +typedef struct ip_pmtu_trace_t_ +{ + u16 pmtu; + u16 packet_size; +} ip_pmtu_trace_t; + +static u8 * +format_ip_pmtu_trace (u8 *s, va_list *args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + ip_pmtu_trace_t *t = va_arg (*args, ip_pmtu_trace_t *); + + s = format (s, "path mtu:%d packet size:%d", t->pmtu, t->packet_size); + + return s; +} + +static inline uword +ip_pmtu_dpo_inline (vlib_main_t *vm, vlib_node_runtime_t *node, + vlib_frame_t *frame, ip_address_family_t af) +{ + u32 n_left_from, *from, next_index, *to_next, n_left_to_next; + u32 frag_sent = 0, small_packets = 0; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + u32 *buffer = 0; + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while 
(n_left_from > 0 && n_left_to_next > 0) + { + const ip_pmtu_dpo_t *ipm0; + u32 pi0, *frag_from, frag_left; + vlib_buffer_t *p0; + ip_frag_error_t error0; + u16 next0; + + /* + * Note: The packet is not enqueued now. It is instead put + * in a vector where other fragments will be put as well. + */ + pi0 = from[0]; + from += 1; + n_left_from -= 1; + + p0 = vlib_get_buffer (vm, pi0); + ipm0 = ip_pmtu_dpo_get (vnet_buffer (p0)->ip.adj_index[VLIB_TX]); + vnet_buffer (p0)->ip.adj_index[VLIB_TX] = ipm0->ipm_dpo.dpoi_index; + next0 = ipm0->ipm_dpo.dpoi_next_node; + + if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED)) + { + ip_pmtu_trace_t *t; + t = vlib_add_trace (vm, node, p0, sizeof (*t)); + t->pmtu = ipm0->ipm_pmtu; + t->packet_size = vlib_buffer_length_in_chain (vm, p0); + } + + if (AF_IP6 == af) + error0 = + ip6_frag_do_fragment (vm, pi0, ipm0->ipm_pmtu, 0, &buffer); + else + error0 = + ip4_frag_do_fragment (vm, pi0, ipm0->ipm_pmtu, 0, &buffer); + + if (AF_IP4 == af && error0 == IP_FRAG_ERROR_DONT_FRAGMENT_SET) + { + icmp4_error_set_vnet_buffer ( + p0, ICMP4_destination_unreachable, + ICMP4_destination_unreachable_fragmentation_needed_and_dont_fragment_set, + ipm0->ipm_pmtu); + next0 = IP_FRAG_NEXT_ICMP_ERROR; + } + else + { + next0 = + (error0 == IP_FRAG_ERROR_NONE ? 
next0 : IP_FRAG_NEXT_DROP); + } + + if (error0 == IP_FRAG_ERROR_NONE) + { + /* Free original buffer chain */ + frag_sent += vec_len (buffer); + small_packets += (vec_len (buffer) == 1); + vlib_buffer_free_one (vm, pi0); /* Free original packet */ + } + else + { + vlib_error_count (vm, node->node_index, error0, 1); + vec_add1 (buffer, pi0); /* Get rid of the original buffer */ + } + + /* Send fragments that were added in the frame */ + frag_from = buffer; + frag_left = vec_len (buffer); + + while (frag_left > 0) + { + while (frag_left > 0 && n_left_to_next > 0) + { + u32 i; + i = to_next[0] = frag_from[0]; + frag_from += 1; + frag_left -= 1; + to_next += 1; + n_left_to_next -= 1; + + vlib_get_buffer (vm, i)->error = node->errors[error0]; + vlib_validate_buffer_enqueue_x1 ( + vm, node, next_index, to_next, n_left_to_next, i, next0); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + vlib_get_next_frame (vm, node, next_index, to_next, + n_left_to_next); + } + vec_reset_length (buffer); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + vec_free (buffer); + + return frame->n_vectors; +} + +// clang-format off + +VLIB_NODE_FN (ip4_ip_pmtu_dpo_node) (vlib_main_t *vm, + vlib_node_runtime_t *node, + vlib_frame_t *from_frame) +{ + return (ip_pmtu_dpo_inline (vm, node, from_frame, 0)); +} + +VLIB_NODE_FN (ip6_ip_pmtu_dpo_node) (vlib_main_t *vm, + vlib_node_runtime_t *node, + vlib_frame_t *from_frame) +{ + return (ip_pmtu_dpo_inline (vm, node, from_frame, 1)); +} + +VLIB_REGISTER_NODE (ip4_ip_pmtu_dpo_node) = { + .name = "ip4-pmtu-dpo", + .vector_size = sizeof (u32), + .format_trace = format_ip_pmtu_trace, + .n_errors = 0, + .n_next_nodes = IP_PMTU_N_NEXT, + .next_nodes = + { + [IP_PMTU_DROP] = "ip4-drop", + } +}; +VLIB_REGISTER_NODE (ip6_ip_pmtu_dpo_node) = { + .name = "ip6-pmtu-dpo", + .vector_size = sizeof (u32), + .format_trace = format_ip_pmtu_trace, + .n_errors = 0, + .n_next_nodes = IP_PMTU_N_NEXT, + .next_nodes = + { + 
[IP_PMTU_DROP] = "ip6-drop", + } +}; + +// clang-format on + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ |