aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorNeale Ranns <neale@graphiant.com>2020-12-21 08:29:34 +0000
committerOle Troan <otroan@employees.org>2021-02-15 17:27:48 +0000
commit8f5fef2c78b95de1a636ce27111722b71702212a (patch)
treea0ebd0189969ccae1f0bdd7c1a9c18dd7a066f2e /src
parent54be0cc044f445853fae7b8995c477605250af16 (diff)
ip: Path MTU
Type: feature Support setting the MTU for a peer on an interface. The minimum value of the path and interface MTU is used at forwarding time. The path MTU is specified for a given peer, by address and table-ID. In the forwarding plane the MTU is enforced either: 1 - if the peer is attached, then the MTU is set on the peer's adjacency 2 - if the peer is not attached, it is remote, then a DPO is added to the peer's FIB entry to perform the necessary fragmentation. Signed-off-by: Neale Ranns <neale@graphiant.com> Change-Id: I8b9ea6a07868b50e97e2561f18d9335407dea7ae
Diffstat (limited to 'src')
-rw-r--r--src/plugins/unittest/fib_test.c30
-rw-r--r--src/vnet/CMakeLists.txt3
-rw-r--r--src/vnet/adj/adj.c24
-rw-r--r--src/vnet/adj/adj.h3
-rw-r--r--src/vnet/adj/adj_glean.c1
-rw-r--r--src/vnet/adj/adj_internal.h1
-rw-r--r--src/vnet/adj/adj_mcast.c2
-rw-r--r--src/vnet/adj/adj_midchain.c1
-rw-r--r--src/vnet/adj/adj_nbr.c25
-rw-r--r--src/vnet/adj/adj_nbr.h7
-rw-r--r--src/vnet/dpo/dpo.c16
-rw-r--r--src/vnet/dpo/dpo.h23
-rw-r--r--src/vnet/dpo/load_balance.c23
-rw-r--r--src/vnet/dpo/mpls_label_dpo.c17
-rw-r--r--src/vnet/fib/fib_entry.c20
-rw-r--r--src/vnet/fib/fib_node.h6
-rw-r--r--src/vnet/fib/fib_path.c1
-rw-r--r--src/vnet/gre/gre.c9
-rw-r--r--src/vnet/ip/ip.api57
-rw-r--r--src/vnet/ip/ip_api.c92
-rw-r--r--src/vnet/ip/ip_path_mtu.c883
-rw-r--r--src/vnet/ip/ip_path_mtu.h126
-rw-r--r--src/vnet/ip/ip_path_mtu_node.c206
23 files changed, 1554 insertions, 22 deletions
diff --git a/src/plugins/unittest/fib_test.c b/src/plugins/unittest/fib_test.c
index b9b77ba479f..c6291fb2658 100644
--- a/src/plugins/unittest/fib_test.c
+++ b/src/plugins/unittest/fib_test.c
@@ -5869,10 +5869,14 @@ fib_test_ae (void)
static int
fib_test_pref (void)
{
- test_main_t *tm = &test_main;
- int res;
+ test_main_t *tm;
+ ip4_main_t *im;
+ int res, i;
+ tm = &test_main;
+ im = &ip4_main;
res = 0;
+
const fib_prefix_t pfx_1_1_1_1_s_32 = {
.fp_len = 32,
.fp_proto = FIB_PROTOCOL_IP4,
@@ -5883,6 +5887,11 @@ fib_test_pref (void)
},
};
+ vec_validate(im->fib_index_by_sw_if_index, tm->hw[2]->sw_if_index);
+
+ for (i = 0; i <= 2; i++)
+ im->fib_index_by_sw_if_index[tm->hw[i]->sw_if_index] = 0;
+
/*
* 2 high, 2 medium and 2 low preference non-recursive paths
*/
@@ -9049,12 +9058,25 @@ static int
fib_test_inherit (void)
{
fib_node_index_t fei;
+ int n_feis, res, i;
test_main_t *tm;
- int n_feis, res;
+ ip4_main_t *im4;
+ ip6_main_t *im6;
+ tm = &test_main;
+ im4 = &ip4_main;
+ im6 = &ip6_main;
res = 0;
+
+ vec_validate(im4->fib_index_by_sw_if_index, tm->hw[2]->sw_if_index);
+ vec_validate(im6->fib_index_by_sw_if_index, tm->hw[2]->sw_if_index);
+
+ for (i = 0; i <= 2; i++)
+ {
+ im4->fib_index_by_sw_if_index[tm->hw[i]->sw_if_index] = 0;
+ im6->fib_index_by_sw_if_index[tm->hw[i]->sw_if_index] = 0;
+ }
n_feis = fib_entry_pool_size();
- tm = &test_main;
const ip46_address_t nh_10_10_10_1 = {
.ip4.as_u32 = clib_host_to_net_u32(0x0a0a0a01),
diff --git a/src/vnet/CMakeLists.txt b/src/vnet/CMakeLists.txt
index 0e1d9c44b05..38aeda5be93 100644
--- a/src/vnet/CMakeLists.txt
+++ b/src/vnet/CMakeLists.txt
@@ -415,6 +415,8 @@ list(APPEND VNET_SOURCES
ip/ip_interface.c
ip/ip_init.c
ip/ip_in_out_acl.c
+ ip/ip_path_mtu.c
+ ip/ip_path_mtu_node.c
ip/ip_punt_drop.c
ip/ip_types.c
ip/lookup.c
@@ -437,6 +439,7 @@ list(APPEND VNET_MULTIARCH_SOURCES
ip/ip6_punt_drop.c
ip/punt_node.c
ip/ip_in_out_acl.c
+ ip/ip_path_mtu_node.c
)
list(APPEND VNET_HEADERS
diff --git a/src/vnet/adj/adj.c b/src/vnet/adj/adj.c
index d3890223dc1..8808294f7a6 100644
--- a/src/vnet/adj/adj.c
+++ b/src/vnet/adj/adj.c
@@ -20,6 +20,7 @@
#include <vnet/adj/adj_mcast.h>
#include <vnet/adj/adj_delegate.h>
#include <vnet/fib/fib_node_list.h>
+#include <vnet/fib/fib_walk.h>
/* Adjacency packet/byte counters indexed by adjacency index. */
vlib_combined_counter_main_t adjacency_counters = {
@@ -326,6 +327,16 @@ adj_dpo_get_urpf (const dpo_id_t *dpo)
return (adj->rewrite_header.sw_if_index);
}
+u16
+adj_dpo_get_mtu (const dpo_id_t *dpo)
+{
+ ip_adjacency_t *adj;
+
+ adj = adj_get(dpo->dpoi_index);
+
+ return (adj->rewrite_header.max_l3_packet_bytes);
+}
+
void
adj_lock (adj_index_t adj_index)
{
@@ -465,6 +476,19 @@ adj_mtu_update_walk_cb (adj_index_t ai,
vnet_rewrite_update_mtu (vnet_get_main(), adj->ia_link,
&adj->rewrite_header);
+ adj_delegate_adj_modified(adj);
+
+ /**
+ * Backwalk to all Path MTU trackers, casual like ..
+ */
+ {
+ fib_node_back_walk_ctx_t bw_ctx = {
+ .fnbw_reason = FIB_NODE_BW_REASON_FLAG_ADJ_MTU,
+ };
+
+ fib_walk_async(FIB_NODE_TYPE_ADJ, ai,
+ FIB_WALK_PRIORITY_LOW, &bw_ctx);
+ }
return (ADJ_WALK_RC_CONTINUE);
}
diff --git a/src/vnet/adj/adj.h b/src/vnet/adj/adj.h
index 44bb2bd981b..c1922c755ec 100644
--- a/src/vnet/adj/adj.h
+++ b/src/vnet/adj/adj.h
@@ -373,6 +373,9 @@ STATIC_ASSERT ((STRUCT_OFFSET_OF (ip_adjacency_t, cacheline3) ==
/* An adj fits into 4 cachelines on your average machine */
STATIC_ASSERT_SIZEOF (ip_adjacency_t, 4 * 64);
#endif
+STATIC_ASSERT ((STRUCT_OFFSET_OF (ip_adjacency_t, sub_type.nbr.next_hop) ==
+ STRUCT_OFFSET_OF (ip_adjacency_t, sub_type.midchain.next_hop)),
+ "IP adjacency nbr and midchain offsets don't match");
/**
* @brief
diff --git a/src/vnet/adj/adj_glean.c b/src/vnet/adj/adj_glean.c
index c52e3d09693..e956318a1ff 100644
--- a/src/vnet/adj/adj_glean.c
+++ b/src/vnet/adj/adj_glean.c
@@ -467,6 +467,7 @@ const static dpo_vft_t adj_glean_dpo_vft = {
.dv_unlock = adj_dpo_unlock,
.dv_format = format_adj_glean,
.dv_get_urpf = adj_dpo_get_urpf,
+ .dv_get_mtu = adj_dpo_get_mtu,
};
/**
diff --git a/src/vnet/adj/adj_internal.h b/src/vnet/adj/adj_internal.h
index 6639d32267f..253c1e982c1 100644
--- a/src/vnet/adj/adj_internal.h
+++ b/src/vnet/adj/adj_internal.h
@@ -126,6 +126,7 @@ extern void adj_mcast_remove(fib_protocol_t proto,
extern void adj_midchain_teardown(ip_adjacency_t *adj);
extern u32 adj_dpo_get_urpf(const dpo_id_t *dpo);
+extern u16 adj_dpo_get_mtu(const dpo_id_t *dpo);
/*
* Adj BFD
diff --git a/src/vnet/adj/adj_mcast.c b/src/vnet/adj/adj_mcast.c
index 590652244e6..a20f61f6f6b 100644
--- a/src/vnet/adj/adj_mcast.c
+++ b/src/vnet/adj/adj_mcast.c
@@ -388,12 +388,14 @@ const static dpo_vft_t adj_mcast_dpo_vft = {
.dv_unlock = adj_dpo_unlock,
.dv_format = format_adj_mcast,
.dv_get_urpf = adj_dpo_get_urpf,
+ .dv_get_mtu = adj_dpo_get_mtu,
};
const static dpo_vft_t adj_mcast_midchain_dpo_vft = {
.dv_lock = adj_dpo_lock,
.dv_unlock = adj_dpo_unlock,
.dv_format = format_adj_mcast_midchain,
.dv_get_urpf = adj_dpo_get_urpf,
+ .dv_get_mtu = adj_dpo_get_mtu,
};
/**
diff --git a/src/vnet/adj/adj_midchain.c b/src/vnet/adj/adj_midchain.c
index a21cd21ea25..3d879e9d7fc 100644
--- a/src/vnet/adj/adj_midchain.c
+++ b/src/vnet/adj/adj_midchain.c
@@ -744,6 +744,7 @@ const static dpo_vft_t adj_midchain_dpo_vft = {
.dv_unlock = adj_dpo_unlock,
.dv_format = format_adj_midchain,
.dv_get_urpf = adj_dpo_get_urpf,
+ .dv_get_mtu = adj_dpo_get_mtu,
};
/**
diff --git a/src/vnet/adj/adj_nbr.c b/src/vnet/adj/adj_nbr.c
index 921588a7ef7..811d0b8faa2 100644
--- a/src/vnet/adj/adj_nbr.c
+++ b/src/vnet/adj/adj_nbr.c
@@ -222,6 +222,27 @@ adj_nbr_alloc (fib_protocol_t nh_proto,
return (adj);
}
+void
+adj_nbr_set_mtu (adj_index_t adj_index, u16 mtu)
+{
+ ip_adjacency_t *adj;
+
+ ASSERT(ADJ_INDEX_INVALID != adj_index);
+
+ adj = adj_get(adj_index);
+
+ if (0 == mtu)
+ vnet_rewrite_update_mtu(vnet_get_main(), adj->ia_link,
+ &adj->rewrite_header);
+ else
+ {
+ vnet_rewrite_update_mtu(vnet_get_main(), adj->ia_link,
+ &adj->rewrite_header);
+ adj->rewrite_header.max_l3_packet_bytes =
+ clib_min (adj->rewrite_header.max_l3_packet_bytes, mtu);
+ }
+}
+
/*
* adj_nbr_add_or_lock
*
@@ -268,13 +289,13 @@ adj_nbr_add_or_lock (fib_protocol_t nh_proto,
* So ask the interface to do it.
*/
vnet_update_adjacency_for_sw_interface(vnm, sw_if_index, adj_index);
+ adj_delegate_adj_created(adj_get(adj_index));
}
else
{
adj_lock(adj_index);
}
- adj_delegate_adj_created(adj_get(adj_index));
return (adj_index);
}
@@ -1055,12 +1076,14 @@ const static dpo_vft_t adj_nbr_dpo_vft = {
.dv_format = format_adj_nbr,
.dv_mem_show = adj_mem_show,
.dv_get_urpf = adj_dpo_get_urpf,
+ .dv_get_mtu = adj_dpo_get_mtu,
};
const static dpo_vft_t adj_nbr_incompl_dpo_vft = {
.dv_lock = adj_dpo_lock,
.dv_unlock = adj_dpo_unlock,
.dv_format = format_adj_nbr_incomplete,
.dv_get_urpf = adj_dpo_get_urpf,
+ .dv_get_mtu = adj_dpo_get_mtu,
};
/**
diff --git a/src/vnet/adj/adj_nbr.h b/src/vnet/adj/adj_nbr.h
index 3a89dc89a22..4874e73a45c 100644
--- a/src/vnet/adj/adj_nbr.h
+++ b/src/vnet/adj/adj_nbr.h
@@ -75,6 +75,13 @@ extern adj_index_t adj_nbr_add_or_lock_w_rewrite(fib_protocol_t nh_proto,
const ip46_address_t *nh_addr,
u32 sw_if_index,
u8 *rewrite);
+
+/**
+ * Set the MTU on an adjacency
+ *
+ */
+extern void adj_nbr_set_mtu(adj_index_t ai, u16 mtu);
+
/**
* @brief When adding a rewrite to an adjacency these are flags that
* apply to that rewrite
diff --git a/src/vnet/dpo/dpo.c b/src/vnet/dpo/dpo.c
index 1331b5501bc..d8342ff17ae 100644
--- a/src/vnet/dpo/dpo.c
+++ b/src/vnet/dpo/dpo.c
@@ -23,6 +23,8 @@
* The VLIB graph nodes are graph of types, the DPO graph is a graph of instances.
*/
+// clang-format off
+
#include <vnet/dpo/dpo.h>
#include <vnet/ip/lookup.h>
#include <vnet/ip/format.h>
@@ -395,6 +397,18 @@ dpo_get_urpf(const dpo_id_t *dpo)
return (~0);
}
+u16
+dpo_get_mtu(const dpo_id_t *dpo)
+{
+ if (dpo_id_is_valid(dpo) &&
+ (NULL != dpo_vfts[dpo->dpoi_type].dv_get_mtu))
+ {
+ return (dpo_vfts[dpo->dpoi_type].dv_get_mtu(dpo));
+ }
+
+ return (0xffff);
+}
+
static u32
dpo_get_next_node (dpo_type_t child_type,
dpo_proto_t child_proto,
@@ -649,3 +663,5 @@ VLIB_CLI_COMMAND (show_fib_memory, static) = {
.short_help = "show dpo memory",
};
/* *INDENT-ON* */
+
+// clang-format on
diff --git a/src/vnet/dpo/dpo.h b/src/vnet/dpo/dpo.h
index ee4990d0058..e9976c2dd87 100644
--- a/src/vnet/dpo/dpo.h
+++ b/src/vnet/dpo/dpo.h
@@ -24,6 +24,8 @@
* instances.
*/
+// clang-format off
+
#ifndef __DPO_H__
#define __DPO_H__
@@ -362,6 +364,16 @@ extern void dpo_stack_from_node(u32 child_node,
extern u32 dpo_get_urpf(const dpo_id_t *dpo);
/**
+ * Get the MTU DPO
+ *
+ * @param dpo
+ * The DPO from which to get the MTU
+ *
+ * @return MTU (0xffff if something more useful was unavailable)
+ */
+extern u16 dpo_get_mtu(const dpo_id_t *dpo);
+
+/**
* @brief A lock function registered for a DPO type
*/
typedef void (*dpo_lock_fn_t)(dpo_id_t *dpo);
@@ -389,6 +401,11 @@ typedef u32* (*dpo_get_next_node_t)(const dpo_id_t *dpo);
typedef u32 (*dpo_get_urpf_t)(const dpo_id_t *dpo);
/**
+ * @brief Given a DPO instance return the MTU
+ */
+typedef u16 (*dpo_get_mtu_t)(const dpo_id_t *dpo);
+
+/**
* @brief Called during FIB interposition when the originally
* registered DPO is used to 'clone' an instance for interposition
* at a particular location in the FIB graph.
@@ -433,6 +450,10 @@ typedef struct dpo_vft_t_
*/
dpo_get_urpf_t dv_get_urpf;
/**
+ * Get MTU
+ */
+ dpo_get_mtu_t dv_get_mtu;
+ /**
* Signal on an interposed child that the parent has changed
*/
dpo_mk_interpose_t dv_mk_interpose;
@@ -548,3 +569,5 @@ do { \
if ((YESNO)) vlib_worker_thread_barrier_release((VM));
#endif
+
+// clang-format on
diff --git a/src/vnet/dpo/load_balance.c b/src/vnet/dpo/load_balance.c
index fb876a09ec2..a212532dffd 100644
--- a/src/vnet/dpo/load_balance.c
+++ b/src/vnet/dpo/load_balance.c
@@ -25,6 +25,8 @@
#include <vnet/ip/ip4_inlines.h>
#include <vnet/ip/ip6_inlines.h>
+// clang-format off
+
/*
* distribution error tolerance for load-balancing
*/
@@ -918,11 +920,30 @@ load_balance_mem_show (void)
load_balance_map_show_mem();
}
+static u16
+load_balance_dpo_get_mtu (const dpo_id_t *dpo)
+{
+ const dpo_id_t *buckets;
+ load_balance_t *lb;
+ u16 i, mtu = 0xffff;
+
+ lb = load_balance_get(dpo->dpoi_index);
+ buckets = load_balance_get_buckets(lb);
+
+ for (i = 0; i < lb->lb_n_buckets; i++)
+ {
+ mtu = clib_min (mtu, dpo_get_mtu (&buckets[i]));
+ }
+
+ return (mtu);
+}
+
const static dpo_vft_t lb_vft = {
.dv_lock = load_balance_lock,
.dv_unlock = load_balance_unlock,
.dv_format = format_load_balance_dpo,
.dv_mem_show = load_balance_mem_show,
+ .dv_get_mtu = load_balance_dpo_get_mtu,
};
/**
@@ -1323,3 +1344,5 @@ VLIB_REGISTER_NODE (bier_load_balance_node) = {
.format_trace = format_bier_load_balance_trace,
.sibling_of = "mpls-load-balance",
};
+
+// clang-format on
diff --git a/src/vnet/dpo/mpls_label_dpo.c b/src/vnet/dpo/mpls_label_dpo.c
index 683b5449513..b87cb1efcd2 100644
--- a/src/vnet/dpo/mpls_label_dpo.c
+++ b/src/vnet/dpo/mpls_label_dpo.c
@@ -18,6 +18,8 @@
#include <vnet/mpls/mpls.h>
#include <vnet/dpo/drop_dpo.h>
+// clang-format off
+
#ifndef CLIB_MARCH_VARIANT
/*
* pool of all MPLS Label DPOs
@@ -1213,12 +1215,25 @@ mpls_label_interpose (const dpo_id_t *original,
mpls_label_dpo_get_index(mld_clone));
}
+static u16
+mpls_label_dpo_get_mtu (const dpo_id_t *dpo)
+{
+ mpls_label_dpo_t *mld;
+
+ mld = mpls_label_dpo_get(dpo->dpoi_index);
+
+ /* return the parent's MTU minus the amount of header
+ * this DPO imposes */
+ return (dpo_get_mtu (&mld->mld_dpo) - sizeof(mpls_label_t) * mld->mld_n_labels);
+}
+
const static dpo_vft_t mld_vft = {
.dv_lock = mpls_label_dpo_lock,
.dv_unlock = mpls_label_dpo_unlock,
.dv_format = format_mpls_label_dpo,
.dv_mem_show = mpls_label_dpo_mem_show,
.dv_mk_interpose = mpls_label_interpose,
+ .dv_get_mtu = mpls_label_dpo_get_mtu,
};
const static char* const mpls_label_imp_pipe_ip4_nodes[] =
@@ -1337,3 +1352,5 @@ mpls_label_dpo_get_type (mpls_label_dpo_flags_t flags)
return (mpls_label_dpo_types[flags]);
}
#endif /* CLIB_MARCH_VARIANT */
+
+// clang-format on
diff --git a/src/vnet/fib/fib_entry.c b/src/vnet/fib/fib_entry.c
index 6edf31b47f3..119a7ac5e77 100644
--- a/src/vnet/fib/fib_entry.c
+++ b/src/vnet/fib/fib_entry.c
@@ -1362,7 +1362,7 @@ fib_entry_cover_updated (fib_node_index_t fib_entry_index)
if (0 == index)
{
/*
- * only the best source gets to set the back walk flags
+ * only the best source gets to set the install result
*/
res = fib_entry_src_action_cover_update(fib_entry, esrc);
bflags = fib_entry_src_get_flags(esrc);
@@ -1370,7 +1370,23 @@ fib_entry_cover_updated (fib_node_index_t fib_entry_index)
}
else
{
- fib_entry_src_action_cover_update(fib_entry, esrc);
+ /*
+ * contributing sources can set backwalk flags
+ */
+ if (esrc->fes_flags & FIB_ENTRY_SRC_FLAG_CONTRIBUTING)
+ {
+ fib_entry_src_cover_res_t tmp = {
+ .install = !0,
+ .bw_reason = FIB_NODE_BW_REASON_FLAG_NONE,
+ };
+
+ tmp = fib_entry_src_action_cover_update(fib_entry, esrc);
+ res.bw_reason |= tmp.bw_reason;
+ }
+ else
+ {
+ fib_entry_src_action_cover_update(fib_entry, esrc);
+ }
}
index++;
}));
diff --git a/src/vnet/fib/fib_node.h b/src/vnet/fib/fib_node.h
index 5cf9182560f..27e67b11c87 100644
--- a/src/vnet/fib/fib_node.h
+++ b/src/vnet/fib/fib_node.h
@@ -119,6 +119,10 @@ typedef enum fib_node_back_walk_reason_t_ {
*/
FIB_NODE_BW_REASON_ADJ_UPDATE,
/**
+ * Walk update the adjacency MTU
+ */
+ FIB_NODE_BW_REASON_ADJ_MTU,
+ /**
* Walk to update children to inform them the adjacency is now down.
*/
FIB_NODE_BW_REASON_ADJ_DOWN,
@@ -135,6 +139,7 @@ typedef enum fib_node_back_walk_reason_t_ {
[FIB_NODE_BW_REASON_INTERFACE_DOWN] = "if-down", \
[FIB_NODE_BW_REASON_INTERFACE_DELETE] = "if-delete", \
[FIB_NODE_BW_REASON_ADJ_UPDATE] = "adj-update", \
+ [FIB_NODE_BW_REASON_ADJ_MTU] = "adj-mtu", \
[FIB_NODE_BW_REASON_ADJ_DOWN] = "adj-down", \
}
@@ -154,6 +159,7 @@ typedef enum fib_node_bw_reason_flag_t_ {
FIB_NODE_BW_REASON_FLAG_INTERFACE_DOWN = (1 << FIB_NODE_BW_REASON_INTERFACE_DOWN),
FIB_NODE_BW_REASON_FLAG_INTERFACE_DELETE = (1 << FIB_NODE_BW_REASON_INTERFACE_DELETE),
FIB_NODE_BW_REASON_FLAG_ADJ_UPDATE = (1 << FIB_NODE_BW_REASON_ADJ_UPDATE),
+ FIB_NODE_BW_REASON_FLAG_ADJ_MTU = (1 << FIB_NODE_BW_REASON_ADJ_MTU),
FIB_NODE_BW_REASON_FLAG_ADJ_DOWN = (1 << FIB_NODE_BW_REASON_ADJ_DOWN),
} __attribute__ ((packed)) fib_node_bw_reason_flag_t;
diff --git a/src/vnet/fib/fib_path.c b/src/vnet/fib/fib_path.c
index f48b64484cb..01140d5d0dc 100644
--- a/src/vnet/fib/fib_path.c
+++ b/src/vnet/fib/fib_path.c
@@ -999,6 +999,7 @@ fib_path_back_walk_notify (fib_node_t *node,
&path->fp_dpo);
}
if ((FIB_NODE_BW_REASON_FLAG_ADJ_UPDATE & ctx->fnbw_reason) ||
+ (FIB_NODE_BW_REASON_FLAG_ADJ_MTU & ctx->fnbw_reason) ||
(FIB_NODE_BW_REASON_FLAG_ADJ_DOWN & ctx->fnbw_reason))
{
/*
diff --git a/src/vnet/gre/gre.c b/src/vnet/gre/gre.c
index 0669c676bf5..fcdf9c0d6bc 100644
--- a/src/vnet/gre/gre.c
+++ b/src/vnet/gre/gre.c
@@ -495,8 +495,13 @@ mgre_update_adj (vnet_main_t * vnm, u32 sw_if_index, adj_index_t ai)
adj->ia_nh_proto, &adj->sub_type.nbr.next_hop);
if (NULL == ne)
- // no NHRP entry to provide the next-hop
- return;
+ {
+ // no TEIB entry to provide the next-hop
+ adj_nbr_midchain_update_rewrite (
+ ai, gre_get_fixup (t->tunnel_dst.fp_proto, adj_get_link_type (ai)),
+ uword_to_pointer (t->flags, void *), ADJ_FLAG_NONE, NULL);
+ return;
+ }
mgre_walk_ctx_t ctx = {
.t = t,
diff --git a/src/vnet/ip/ip.api b/src/vnet/ip/ip.api
index 3072e3e7c63..f49fc16bc1d 100644
--- a/src/vnet/ip/ip.api
+++ b/src/vnet/ip/ip.api
@@ -704,6 +704,63 @@ autoreply define ip_reassembly_enable_disable
vl_api_ip_reass_type_t type;
};
+/**
+ @brief Set a Path MTU value. i.e. a MTU value for a given neighbour.
+ The neighbour can be described as attached (w/ interface and next-hop)
+ or remote (w/ table_id and next-hop);
+ @param client_index - opaque cookie to identify the sender
+ @param context - sender context, to match reply w/ request
+ @param table_id - table-ID for next-hop
+ @param nh - Next hop
+ @param path_mtu - value to set, 0 is disable.
+*/
+typedef ip_path_mtu
+{
+ u32 client_index;
+ u32 context;
+ u32 table_id;
+ vl_api_address_t nh;
+ u16 path_mtu;
+};
+autoreply define ip_path_mtu_update
+{
+ u32 client_index;
+ u32 context;
+ vl_api_ip_path_mtu_t pmtu;
+};
+define ip_path_mtu_get
+{
+ u32 client_index;
+ u32 context;
+ u32 cursor;
+};
+define ip_path_mtu_get_reply
+{
+ u32 context;
+ i32 retval;
+ u32 cursor;
+};
+define ip_path_mtu_details
+{
+ u32 context;
+ vl_api_ip_path_mtu_t pmtu;
+};
+service {
+ rpc ip_path_mtu_get returns ip_path_mtu_get_reply
+ stream ip_path_mtu_details;
+};
+
+autoreply define ip_path_mtu_replace_begin
+{
+ u32 client_index;
+ u32 context;
+};
+autoreply define ip_path_mtu_replace_end
+{
+ u32 client_index;
+ u32 context;
+};
+
/*
* Local Variables:
* eval: (c-set-style "gnu")
diff --git a/src/vnet/ip/ip_api.c b/src/vnet/ip/ip_api.c
index 3bf404baadf..5b87f7cc86a 100644
--- a/src/vnet/ip/ip_api.c
+++ b/src/vnet/ip/ip_api.c
@@ -28,6 +28,7 @@
#include <vnet/ip/ip_types_api.h>
#include <vnet/ip/ip_punt_drop.h>
#include <vnet/ip/ip_types_api.h>
+#include <vnet/ip/ip_path_mtu.h>
#include <vnet/fib/fib_table.h>
#include <vnet/fib/fib_api.h>
#include <vnet/ethernet/arp_packet.h>
@@ -104,7 +105,11 @@
_ (IP_REASSEMBLY_SET, ip_reassembly_set) \
_ (IP_REASSEMBLY_GET, ip_reassembly_get) \
_ (IP_REASSEMBLY_ENABLE_DISABLE, ip_reassembly_enable_disable) \
- _ (IP_PUNT_REDIRECT_DUMP, ip_punt_redirect_dump)
+ _ (IP_PUNT_REDIRECT_DUMP, ip_punt_redirect_dump) \
+ _ (IP_PATH_MTU_UPDATE, ip_path_mtu_update) \
+ _ (IP_PATH_MTU_REPLACE_BEGIN, ip_path_mtu_replace_begin) \
+ _ (IP_PATH_MTU_REPLACE_END, ip_path_mtu_replace_end) \
+ _ (IP_PATH_MTU_GET, ip_path_mtu_get)
static void
vl_api_sw_interface_ip6_enable_disable_t_handler
@@ -1134,18 +1139,18 @@ static void
REPLY_MACRO (VL_API_IP_CONTAINER_PROXY_ADD_DEL_REPLY);
}
-typedef struct ip_container_proxy_walk_ctx_t_
+typedef struct ip_walk_ctx_t_
{
vl_api_registration_t *reg;
u32 context;
-} ip_container_proxy_walk_ctx_t;
+} ip_walk_ctx_t;
static int
ip_container_proxy_send_details (const fib_prefix_t * pfx, u32 sw_if_index,
void *args)
{
vl_api_ip_container_proxy_details_t *mp;
- ip_container_proxy_walk_ctx_t *ctx = args;
+ ip_walk_ctx_t *ctx = args;
mp = vl_msg_api_alloc (sizeof (*mp));
if (!mp)
@@ -1173,7 +1178,7 @@ vl_api_ip_container_proxy_dump_t_handler (vl_api_ip_container_proxy_dump_t *
if (!reg)
return;
- ip_container_proxy_walk_ctx_t ctx = {
+ ip_walk_ctx_t ctx = {
.context = mp->context,
.reg = reg,
};
@@ -1624,21 +1629,15 @@ void
REPLY_MACRO (VL_API_IP_REASSEMBLY_ENABLE_DISABLE_REPLY);
}
-typedef struct ip_punt_redirect_walk_ctx_t_
-{
- vl_api_registration_t *reg;
- u32 context;
-} ip_punt_redirect_walk_ctx_t;
-
static walk_rc_t
send_ip_punt_redirect_details (u32 rx_sw_if_index,
const ip_punt_redirect_rx_t * ipr, void *arg)
{
- ip_punt_redirect_walk_ctx_t *ctx = arg;
vl_api_ip_punt_redirect_details_t *mp;
fib_path_encode_ctx_t path_ctx = {
.rpaths = NULL,
};
+ ip_walk_ctx_t *ctx = arg;
mp = vl_msg_api_alloc (sizeof (*mp));
if (!mp)
@@ -1676,7 +1675,7 @@ vl_api_ip_punt_redirect_dump_t_handler (vl_api_ip_punt_redirect_dump_t * mp)
if (mp->is_ipv6 == 1)
fproto = FIB_PROTOCOL_IP6;
- ip_punt_redirect_walk_ctx_t ctx = {
+ ip_walk_ctx_t ctx = {
.reg = reg,
.context = mp->context,
};
@@ -1699,6 +1698,73 @@ vl_api_ip_punt_redirect_dump_t_handler (vl_api_ip_punt_redirect_dump_t * mp)
ip_punt_redirect_walk (fproto, send_ip_punt_redirect_details, &ctx);
}
+void
+vl_api_ip_path_mtu_update_t_handler (vl_api_ip_path_mtu_update_t *mp)
+{
+ vl_api_ip_path_mtu_update_reply_t *rmp;
+ ip_address_t nh;
+ int rv = 0;
+
+ ip_address_decode2 (&mp->pmtu.nh, &nh);
+
+ rv = ip_path_mtu_update (&nh, ntohl (mp->pmtu.table_id),
+ ntohs (mp->pmtu.path_mtu));
+
+ REPLY_MACRO (VL_API_IP_PATH_MTU_UPDATE_REPLY);
+}
+
+void
+vl_api_ip_path_mtu_replace_begin_t_handler (
+ vl_api_ip_path_mtu_replace_begin_t *mp)
+{
+ vl_api_ip_path_mtu_replace_begin_reply_t *rmp;
+ int rv;
+
+ rv = ip_path_mtu_replace_begin ();
+
+ REPLY_MACRO (VL_API_IP_PATH_MTU_REPLACE_BEGIN_REPLY);
+}
+
+void
+vl_api_ip_path_mtu_replace_end_t_handler (vl_api_ip_path_mtu_replace_end_t *mp)
+{
+ vl_api_ip_path_mtu_replace_end_reply_t *rmp;
+ int rv;
+
+ rv = ip_path_mtu_replace_end ();
+
+ REPLY_MACRO (VL_API_IP_PATH_MTU_REPLACE_END_REPLY);
+}
+
+static void
+send_ip_path_mtu_details (index_t ipti, vl_api_registration_t *rp, u32 context)
+{
+ vl_api_ip_path_mtu_details_t *rmp;
+ ip_address_t ip;
+ ip_pmtu_t *ipt;
+
+ ipt = ip_path_mtu_get (ipti);
+
+ REPLY_MACRO_DETAILS4 (VL_API_IP_PATH_MTU_DETAILS, rp, context, ({
+ ip_pmtu_get_ip (ipt, &ip);
+ ip_address_encode2 (&ip, &rmp->pmtu.nh);
+ rmp->pmtu.table_id =
+ htonl (ip_pmtu_get_table_id (ipt));
+ rmp->pmtu.path_mtu = htons (ipt->ipt_cfg_pmtu);
+ }));
+}
+
+static void
+vl_api_ip_path_mtu_get_t_handler (vl_api_ip_path_mtu_get_t *mp)
+{
+ vl_api_ip_path_mtu_get_reply_t *rmp;
+ i32 rv = 0;
+
+ REPLY_AND_DETAILS_MACRO (
+ VL_API_IP_PATH_MTU_GET_REPLY, ip_pmtu_pool,
+ ({ send_ip_path_mtu_details (cursor, rp, mp->context); }));
+}
+
#define vl_msg_name_crc_list
#include <vnet/ip/ip.api.h>
#undef vl_msg_name_crc_list
diff --git a/src/vnet/ip/ip_path_mtu.c b/src/vnet/ip/ip_path_mtu.c
new file mode 100644
index 00000000000..38adb44065b
--- /dev/null
+++ b/src/vnet/ip/ip_path_mtu.c
@@ -0,0 +1,883 @@
+/*
+ *------------------------------------------------------------------
+ * ip_path_mtu.c
+ *
+ * Copyright (c) 2021 Graphiant.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vnet/ip/ip_path_mtu.h>
+#include <vnet/ip/ip_frag.h>
+#include <vnet/adj/adj_delegate.h>
+#include <vnet/adj/adj_nbr.h>
+#include <vnet/fib/fib_table.h>
+#include <vnet/fib/fib_entry_track.h>
+
+#include <vnet/dpo/drop_dpo.h>
+
+/**
+ * Path MTU
+ *
+ * A path is a peer. A peer is known by an IP address (in a table).
+ * Insert a DPO in the forwarding chain for the peer to perform the
+ * fragmentation.
+ * For attached peers, all traffic will use the peer's adjacency, there
+ * is already an MTU check in the adjacency (for the link's MTU) so as an
+ * optimisation, instead of using a DPO, we add a delegate to the adjacency
+ * to set the adjacency's MTU to the path MTU.
+ */
+
+/**
+ * the logger
+ */
+static vlib_log_class_t ip_pmtu_logger;
+
+static adj_delegate_type_t ip_pmtu_adj_delegate_type;
+static fib_source_t ip_pmtu_source;
+
+/**
+ * DPO pool
+ */
+ip_pmtu_dpo_t *ip_pmtu_dpo_pool;
+
+/**
+ * DPO type registered for these GBP FWD
+ */
+static dpo_type_t ip_pmtu_dpo_type;
+
+/**
+ * Fib node type for the tracker
+ */
+static fib_node_type_t ip_pmtu_fib_type;
+
+/**
+ * Path MTU tracker pool
+ */
+ip_pmtu_t *ip_pmtu_pool;
+
+/**
+ * Delegate added to adjacencies to track path MTU
+ */
+typedef struct ip_path_mtu_adj_delegate_t_
+{
+ u16 pmtu;
+} ip_path_mtu_adj_delegate_t;
+
+static ip_path_mtu_adj_delegate_t *ip_path_mtu_adj_delegate_pool;
+
+/* DB of all FIB PMTU settings */
+typedef struct ip_pmtu_key_t_
+{
+ ip46_address_t nh;
+ u32 table_id;
+ fib_protocol_t fproto;
+} __clib_packed ip_pmtu_key_t;
+
+static uword *ip_pmtu_db;
+
+#define IP_PMTU_TRKR_DBG(_ipt, _fmt, _args...) \
+ { \
+ vlib_log_debug (ip_pmtu_logger, "[%U]: " _fmt ": ", format_ip_pmtu, \
+ _ipt - ip_pmtu_pool, ##_args); \
+ }
+#define IP_PMTU_DBG(_fmt, _args...) \
+ { \
+ vlib_log_debug (ip_pmtu_logger, _fmt ": ", ##_args); \
+ }
+
+static u8 *
+format_ip_pmtu_flags (u8 *s, va_list *ap)
+{
+ ip_pmtu_flags_t f = va_arg (*ap, ip_pmtu_flags_t);
+
+ if (0)
+ ;
+#define _(a, b, c) else if (f & IP_PMTU_FLAG_##a) s = format (s, "%s ", c);
+ foreach_ip_pmtu_flag
+#undef _
+
+ return (s);
+}
+
+u32
+ip_pmtu_get_table_id (const ip_pmtu_t *ipt)
+{
+ const fib_prefix_t *pfx;
+ u32 fib_index;
+
+ pfx = fib_entry_get_prefix (ipt->ipt_fib_entry);
+ fib_index = fib_entry_get_fib_index (ipt->ipt_fib_entry);
+
+ return (fib_table_get_table_id (fib_index, pfx->fp_proto));
+}
+
+void
+ip_pmtu_get_ip (const ip_pmtu_t *ipt, ip_address_t *ip)
+{
+ const fib_prefix_t *pfx;
+
+ pfx = fib_entry_get_prefix (ipt->ipt_fib_entry);
+ ip_address_from_46 (&pfx->fp_addr, pfx->fp_proto, ip);
+}
+
+static u8 *
+format_ip_pmtu (u8 *s, va_list *ap)
+{
+ ip_pmtu_t *ipt;
+ index_t ipti = va_arg (*ap, index_t);
+ const fib_prefix_t *pfx;
+ u32 fib_index;
+
+ ipt = pool_elt_at_index (ip_pmtu_pool, ipti);
+ pfx = fib_entry_get_prefix (ipt->ipt_fib_entry);
+ fib_index = fib_entry_get_fib_index (ipt->ipt_fib_entry);
+
+ s =
+ format (s, "[%d] [tbl:[%d:%d]] %U pmtu:[cfg:%d, oper:%d, parent:%d] [%U]",
+ ipti, ip_pmtu_get_table_id (ipt), fib_index, format_fib_prefix,
+ pfx, ipt->ipt_cfg_pmtu, ipt->ipt_oper_pmtu, ipt->ipt_parent_pmtu,
+ format_ip_pmtu_flags, ipt->ipt_flags);
+
+ return (s);
+}
+
+static u8 *
+format_ip_path_mtu_adj_delegate (const adj_delegate_t *aed, u8 *s)
+{
+ ip_path_mtu_adj_delegate_t *ip_adj;
+
+ ip_adj = pool_elt_at_index (ip_path_mtu_adj_delegate_pool, aed->ad_index);
+
+ s = format (s, "IP path-MTU: %d", ip_adj->pmtu);
+
+ return (s);
+}
+
+static void
+ip_pmtu_adj_delegate_adj_created (adj_index_t ai)
+{
+ ip_path_mtu_adj_delegate_t *ipp_ad;
+ const ip_pmtu_t *ipt;
+ ip_adjacency_t *adj;
+ u32 table_id;
+ uword *p;
+
+ adj = adj_get (ai);
+
+ switch (adj->lookup_next_index)
+ {
+ case IP_LOOKUP_NEXT_DROP:
+ case IP_LOOKUP_NEXT_PUNT:
+ case IP_LOOKUP_NEXT_LOCAL:
+ case IP_LOOKUP_NEXT_GLEAN:
+ case IP_LOOKUP_NEXT_MCAST:
+ case IP_LOOKUP_NEXT_BCAST:
+ case IP_LOOKUP_NEXT_MCAST_MIDCHAIN:
+ case IP_LOOKUP_NEXT_ICMP_ERROR:
+ case IP_LOOKUP_N_NEXT:
+ return;
+
+ case IP_LOOKUP_NEXT_ARP:
+ case IP_LOOKUP_NEXT_REWRITE:
+ case IP_LOOKUP_NEXT_MIDCHAIN:
+ break;
+ }
+
+ table_id = fib_table_get_table_id_for_sw_if_index (
+ adj->ia_nh_proto, adj->rewrite_header.sw_if_index);
+
+ ip_pmtu_key_t key = {
+ .nh = adj->sub_type.nbr.next_hop,
+ .table_id = table_id,
+ .fproto = adj->ia_nh_proto,
+ };
+
+ p = hash_get_mem (ip_pmtu_db, &key);
+
+ if (NULL == p)
+ return;
+
+ ipt = pool_elt_at_index (ip_pmtu_pool, p[0]);
+
+ pool_get (ip_path_mtu_adj_delegate_pool, ipp_ad);
+ ipp_ad->pmtu = ipt->ipt_cfg_pmtu;
+
+ adj_delegate_add (adj, ip_pmtu_adj_delegate_type,
+ ipp_ad - ip_path_mtu_adj_delegate_pool);
+
+ adj_nbr_set_mtu (ai, ipp_ad->pmtu);
+
+ IP_PMTU_TRKR_DBG (ipt, "adj-added:", ai);
+}
+
+static void
+ip_pmtu_adj_delegate_adj_deleted (adj_delegate_t *ad)
+{
+ pool_put_index (ip_path_mtu_adj_delegate_pool, ad->ad_index);
+}
+
+static void
+ip_pmtu_adj_delegate_adj_modified (adj_delegate_t *ad)
+{
+ ip_path_mtu_adj_delegate_t *ipp_ad;
+
+ ipp_ad = pool_elt_at_index (ip_path_mtu_adj_delegate_pool, ad->ad_index);
+
+ adj_nbr_set_mtu (ad->ad_adj_index, ipp_ad->pmtu);
+}
+
+const adj_delegate_vft_t ip_path_adj_delegate_vft = {
+ .adv_format = format_ip_path_mtu_adj_delegate,
+ .adv_adj_deleted = ip_pmtu_adj_delegate_adj_deleted,
+ .adv_adj_modified = ip_pmtu_adj_delegate_adj_modified,
+ .adv_adj_created = ip_pmtu_adj_delegate_adj_created,
+};
+
+static bool
+ip_path_mtu_value_invalid (u16 pmtu)
+{
+ return (pmtu == 0 || pmtu == 0xffff);
+}
+
+static adj_walk_rc_t
+ip_ptmu_adj_walk_remove (adj_index_t ai, void *ctx)
+{
+ adj_delegate_t *ad;
+
+ ad = adj_delegate_get (adj_get (ai), ip_pmtu_adj_delegate_type);
+
+ if (ad)
+ {
+ adj_nbr_set_mtu (ai, 0);
+
+ pool_put_index (ip_path_mtu_adj_delegate_pool, ad->ad_index);
+ adj_delegate_remove (ai, ip_pmtu_adj_delegate_type);
+ }
+ return (ADJ_WALK_RC_CONTINUE);
+}
+
+static adj_walk_rc_t
+ip_ptmu_adj_walk_update (adj_index_t ai, void *ctx)
+{
+ ip_path_mtu_adj_delegate_t *ipp_ad;
+ adj_delegate_t *ad;
+ u16 *pmtup;
+
+ pmtup = ctx;
+ ad = adj_delegate_get (adj_get (ai), ip_pmtu_adj_delegate_type);
+
+ if (ad)
+ ipp_ad = pool_elt_at_index (ip_path_mtu_adj_delegate_pool, ad->ad_index);
+ else
+ {
+ pool_get (ip_path_mtu_adj_delegate_pool, ipp_ad);
+
+ adj_delegate_add (adj_get (ai), ip_pmtu_adj_delegate_type,
+ ipp_ad - ip_path_mtu_adj_delegate_pool);
+ }
+
+ ipp_ad->pmtu = *pmtup;
+
+ adj_nbr_set_mtu (ai, ipp_ad->pmtu);
+
+ return (ADJ_WALK_RC_CONTINUE);
+}
+
+static ip_pmtu_dpo_t *
+ip_pmtu_dpo_alloc (void)
+{
+ ip_pmtu_dpo_t *ipm;
+
+ pool_get_aligned_zero (ip_pmtu_dpo_pool, ipm, sizeof (ip_pmtu_dpo_t));
+
+ return (ipm);
+}
+
+static ip_pmtu_dpo_t *
+ip_pmtu_dpo_get_from_dpo (const dpo_id_t *dpo)
+{
+ ASSERT (ip_pmtu_dpo_type == dpo->dpoi_type);
+
+ return (ip_pmtu_dpo_get (dpo->dpoi_index));
+}
+
+static index_t
+ip_pmtu_dpo_get_index (ip_pmtu_dpo_t *ipm)
+{
+ return (ipm - ip_pmtu_dpo_pool);
+}
+
+static void
+ip_pmtu_dpo_lock (dpo_id_t *dpo)
+{
+ ip_pmtu_dpo_t *ipm;
+
+ ipm = ip_pmtu_dpo_get_from_dpo (dpo);
+ ipm->ipm_locks++;
+}
+
+static void
+ip_pmtu_dpo_unlock (dpo_id_t *dpo)
+{
+ ip_pmtu_dpo_t *ipm;
+
+ ipm = ip_pmtu_dpo_get_from_dpo (dpo);
+ ipm->ipm_locks--;
+
+ if (0 == ipm->ipm_locks)
+ {
+ dpo_reset (&ipm->ipm_dpo);
+ pool_put (ip_pmtu_dpo_pool, ipm);
+ }
+}
+
+static u32
+ip_pmtu_dpo_get_urpf (const dpo_id_t *dpo)
+{
+ ip_pmtu_dpo_t *ipm;
+
+ ipm = ip_pmtu_dpo_get_from_dpo (dpo);
+
+ return (dpo_get_urpf (&ipm->ipm_dpo));
+}
+
+void
+ip_pmtu_dpo_add_or_lock (fib_protocol_t fproto, u16 pmtu, dpo_id_t *dpo)
+{
+ ip_pmtu_dpo_t *ipm;
+ dpo_id_t parent = DPO_INVALID;
+
+ ipm = ip_pmtu_dpo_alloc ();
+
+ ipm->ipm_proto = fib_proto_to_dpo (fproto);
+ ipm->ipm_pmtu = pmtu;
+
+ dpo_copy (&parent, drop_dpo_get (ipm->ipm_proto));
+ dpo_stack (ip_pmtu_dpo_type, ipm->ipm_proto, &ipm->ipm_dpo, &parent);
+ dpo_set (dpo, ip_pmtu_dpo_type, ipm->ipm_proto, ip_pmtu_dpo_get_index (ipm));
+}
+
+u8 *
+format_ip_pmtu_dpo (u8 *s, va_list *ap)
+{
+ index_t index = va_arg (*ap, index_t);
+ u32 indent = va_arg (*ap, u32);
+ ip_pmtu_dpo_t *ipm = ip_pmtu_dpo_get (index);
+
+ s = format (s, "ip-pmtu-dpo: %U, mtu:%d", format_dpo_proto, ipm->ipm_proto,
+ ipm->ipm_pmtu);
+ s = format (s, "\n%U", format_white_space, indent + 2);
+ s = format (s, "%U", format_dpo_id, &ipm->ipm_dpo, indent + 4);
+
+ return (s);
+}
+
+/**
+ * Interpose a path MTU DPO
+ */
+static void
+ip_pmtu_dpo_interpose (const dpo_id_t *original, const dpo_id_t *parent,
+ dpo_id_t *clone)
+{
+ ip_pmtu_dpo_t *ipm, *ipm_clone;
+
+ ipm_clone = ip_pmtu_dpo_alloc ();
+ ipm = ip_pmtu_dpo_get (original->dpoi_index);
+
+ ipm_clone->ipm_proto = ipm->ipm_proto;
+ ipm_clone->ipm_pmtu = ipm->ipm_pmtu;
+
+ dpo_stack (ip_pmtu_dpo_type, ipm_clone->ipm_proto, &ipm_clone->ipm_dpo,
+ parent);
+ dpo_set (clone, ip_pmtu_dpo_type, ipm_clone->ipm_proto,
+ ip_pmtu_dpo_get_index (ipm_clone));
+}
+
+static u16
+ip_pmtu_dpo_get_mtu (const dpo_id_t *dpo)
+{
+ ip_pmtu_dpo_t *ipd;
+
+ ipd = pool_elt_at_index (ip_pmtu_dpo_pool, dpo->dpoi_index);
+
+ return (ipd->ipm_pmtu);
+}
+
/**
 * Virtual function table registered for the path-MTU DPO type.
 * No dv_mem_show: the pool is small and shown via 'show ip pmtu'.
 */
const static dpo_vft_t ip_pmtu_dpo_vft = {
  .dv_lock = ip_pmtu_dpo_lock,
  .dv_unlock = ip_pmtu_dpo_unlock,
  .dv_format = format_ip_pmtu_dpo,
  .dv_get_urpf = ip_pmtu_dpo_get_urpf,
  .dv_mk_interpose = ip_pmtu_dpo_interpose,
  .dv_get_mtu = ip_pmtu_dpo_get_mtu,
};
+
/**
 * @brief The per-protocol VLIB graph nodes that are assigned to a path-MTU
 * DPO.
 *
 * this means that these graph nodes are ones from which a path-MTU DPO is
 * the parent object in the DPO-graph.
 */
const static char *const ip_pmtu_dpo_ip4_nodes[] = {
  "ip4-pmtu-dpo",
  NULL,
};

const static char *const ip_pmtu_dpo_ip6_nodes[] = {
  "ip6-pmtu-dpo",
  NULL,
};

/* indexed by dpo_proto_t; only IP4/IP6 are supported */
const static char *const *const ip_pmtu_dpo_nodes[DPO_PROTO_NUM] = {
  [DPO_PROTO_IP4] = ip_pmtu_dpo_ip4_nodes,
  [DPO_PROTO_IP6] = ip_pmtu_dpo_ip6_nodes,
};
+
+static bool
+ip_mtu_fib_entry_is_attached (fib_node_index_t fib_entry)
+{
+ const fib_prefix_t *pfx;
+ u32 cover, fib_index;
+
+ fib_index = fib_entry_get_fib_index (fib_entry);
+ pfx = fib_entry_get_prefix (fib_entry);
+
+ /*
+ * If the tracked prefix's cover is attached, then all packets that
+ * are forwarded to this neighbour will use the adjacency, this is a
+ * more efficient place to perform the MTU check and fragging
+ */
+ cover = fib_table_get_less_specific (fib_index, pfx);
+
+ return (FIB_ENTRY_FLAG_ATTACHED & fib_entry_get_flags (cover) ||
+ FIB_ENTRY_FLAG_ATTACHED & fib_entry_get_flags (fib_entry));
+}
+
/**
 * Create a path-MTU tracker for the peer 'pfx', insert it in the DB
 * under 'key', and apply the MTU: via adjacency delegates when the
 * peer is attached, or via an interposed DPO on its FIB entry when it
 * is remote.
 *
 * @return the tracker's index in ip_pmtu_pool
 */
static index_t
ip_pmtu_alloc (u32 fib_index, const fib_prefix_t *pfx,
	       const ip_pmtu_key_t *key, u16 pmtu)
{
  dpo_id_t ip_dpo = DPO_INVALID;
  ip_pmtu_t *ipt;
  fib_node_index_t cover;
  const dpo_id_t *lb_dpo;
  index_t ipti;

  pool_get (ip_pmtu_pool, ipt);
  fib_node_init (&(ipt->ipt_node), ip_pmtu_fib_type);

  ipti = ipt - ip_pmtu_pool;
  hash_set_mem_alloc (&ip_pmtu_db, key, ipti);

  ipt->ipt_cfg_pmtu = pmtu;
  /* track the peer's FIB entry so back-walks re-evaluate this tracker */
  ipt->ipt_fib_entry = fib_entry_track (fib_index, pfx, ip_pmtu_fib_type, ipti,
					&ipt->ipt_sibling);

  /*
   * If the tracked prefix's cover is attached, then all packets that
   * are forwarded to this neighbour will use the adjacency, this is a
   * more efficient place to perform the MTU check and fragging
   */
  cover = fib_table_get_less_specific (fib_index, pfx);

  if (ip_mtu_fib_entry_is_attached (ipt->ipt_fib_entry))
    {
      u32 sw_if_index;

      ipt->ipt_flags |= IP_PMTU_FLAG_ATTACHED;
      /* attached: the configured value is used as-is */
      ipt->ipt_oper_pmtu = ipt->ipt_cfg_pmtu;

      sw_if_index = fib_entry_get_resolving_interface (cover);

      /* walk all adjs to add/update delegate */
      adj_nbr_walk_nh (sw_if_index, pfx->fp_proto, &pfx->fp_addr,
		       ip_ptmu_adj_walk_update, &ipt->ipt_oper_pmtu);
    }
  else
    {
      ipt->ipt_flags |= IP_PMTU_FLAG_REMOTE;

      lb_dpo = fib_entry_contribute_ip_forwarding (ipt->ipt_fib_entry);

      /* never operate above what the egress path can already carry */
      ipt->ipt_oper_pmtu = clib_min (dpo_get_mtu (lb_dpo), ipt->ipt_cfg_pmtu);

      /*
       * interpose a policy DPO from the nh so that MTU is applied
       */
      ip_pmtu_dpo_add_or_lock (pfx->fp_proto, ipt->ipt_oper_pmtu, &ip_dpo);

      fib_table_entry_special_dpo_add (fib_index, pfx, ip_pmtu_source,
				       FIB_ENTRY_FLAG_INTERPOSE, &ip_dpo);
      dpo_reset (&ip_dpo);
    }

  IP_PMTU_TRKR_DBG (ipt, "create");

  return (ipti);
}
+
/**
 * Re-evaluate and re-apply the path-MTU for a tracker, handling
 * transitions between attached (adjacency delegate) and remote
 * (interposed DPO) enforcement when the tracked FIB entry changes.
 */
static void
ip_pmtu_stack (ip_pmtu_t *ipt)
{
  bool was_attached, is_attached;
  const fib_prefix_t *pfx;
  u32 fib_index;

  pfx = fib_entry_get_prefix (ipt->ipt_fib_entry);
  fib_index = fib_entry_get_fib_index (ipt->ipt_fib_entry);

  was_attached = !!(ipt->ipt_flags & IP_PMTU_FLAG_ATTACHED);
  is_attached = ip_mtu_fib_entry_is_attached (ipt->ipt_fib_entry);

  if (was_attached && !is_attached)
    {
      /* transition from attached to remote - walk all adjs to remove delegate
       */
      adj_nbr_walk_nh (fib_entry_get_resolving_interface (ipt->ipt_fib_entry),
		       pfx->fp_proto, &pfx->fp_addr, ip_ptmu_adj_walk_remove,
		       &ipt->ipt_oper_pmtu);
      ipt->ipt_flags &= ~IP_PMTU_FLAG_ATTACHED;
    }
  if (!was_attached && is_attached)
    {
      /* transition from remote to attached - remove the DPO */
      fib_table_entry_special_remove (fib_index, pfx, ip_pmtu_source);
      ipt->ipt_flags &= ~IP_PMTU_FLAG_REMOTE;
    }

  if (is_attached)
    {
      /* walk all adjs to add/update delegate */
      ipt->ipt_oper_pmtu = ipt->ipt_cfg_pmtu;
      adj_nbr_walk_nh (fib_entry_get_resolving_interface (ipt->ipt_fib_entry),
		       pfx->fp_proto, &pfx->fp_addr, ip_ptmu_adj_walk_update,
		       &ipt->ipt_oper_pmtu);
      ipt->ipt_flags |= IP_PMTU_FLAG_ATTACHED;
    }
  else
    {
      const dpo_id_t *lb_dpo;
      u16 dpo_mtu;

      /* remove unconditionally; re-added below only when it is needed */
      fib_table_entry_special_remove (fib_index, pfx, ip_pmtu_source);

      ipt->ipt_flags |= IP_PMTU_FLAG_REMOTE;
      lb_dpo = fib_entry_contribute_ip_forwarding (ipt->ipt_fib_entry);
      dpo_mtu = dpo_get_mtu (lb_dpo);

      /* clamp to the egress path's MTU */
      ipt->ipt_oper_pmtu = clib_min (dpo_mtu, ipt->ipt_cfg_pmtu);

      /*
       * if the configured path-MTU is less that the egress/interface then
       * interpose a policy DPO from the nh so that MTU is applied
       * (when equal, the egress MTU already enforces the limit)
       */
      if (ipt->ipt_oper_pmtu < dpo_mtu)
	{
	  dpo_id_t ip_dpo = DPO_INVALID;

	  ip_pmtu_dpo_add_or_lock (pfx->fp_proto, ipt->ipt_oper_pmtu, &ip_dpo);

	  fib_table_entry_special_dpo_update (
	    fib_index, pfx, ip_pmtu_source, FIB_ENTRY_FLAG_INTERPOSE, &ip_dpo);
	  dpo_reset (&ip_dpo);
	}
    }
  IP_PMTU_TRKR_DBG (ipt, "stack");
}
+
+static void
+ip_pmtu_update (index_t ipti, u16 pmtu)
+{
+ ip_pmtu_t *ipt;
+
+ ipt = pool_elt_at_index (ip_pmtu_pool, ipti);
+ ipt->ipt_flags &= ~IP_PMTU_FLAG_STALE;
+ ipt->ipt_cfg_pmtu = pmtu;
+
+ ip_pmtu_stack (ipt);
+}
+
/**
 * Tear down a path-MTU tracker: undo its enforcement mechanism
 * (interposed DPO for remote peers, adjacency delegates otherwise),
 * stop tracking the FIB entry, and remove it from the DB and pool.
 *
 * @return the (now freed) pool index of the tracker
 */
static index_t
ip_pmtu_destroy (index_t ipti, const ip_pmtu_key_t *key)
{
  ip_pmtu_t *ipt;
  const fib_prefix_t *pfx;

  ipt = pool_elt_at_index (ip_pmtu_pool, ipti);
  pfx = fib_entry_get_prefix (ipt->ipt_fib_entry);

  IP_PMTU_TRKR_DBG (ipt, "destroy");

  if (ipt->ipt_flags & IP_PMTU_FLAG_REMOTE)
    fib_table_entry_special_remove (
      fib_entry_get_fib_index (ipt->ipt_fib_entry), pfx, ip_pmtu_source);
  else
    /* remove the delegate from all the adjacencies */
    adj_nbr_walk_nh (fib_entry_get_resolving_interface (ipt->ipt_fib_entry),
		     pfx->fp_proto, &pfx->fp_addr, ip_ptmu_adj_walk_remove,
		     NULL);

  /*
   * Drop the fib entry we're tracking
   */
  fib_entry_untrack (ipt->ipt_fib_entry, ipt->ipt_sibling);

  /*
   * remove from DB and return to pool
   */
  hash_unset_mem_free (&ip_pmtu_db, key);
  pool_put (ip_pmtu_pool, ipt);

  return (ipti);
}
+
+int
+ip_path_mtu_update (const ip_address_t *nh, u32 table_id, u16 pmtu)
+{
+ fib_prefix_t pfx;
+ u32 fib_index;
+ uword *p;
+
+ ip_address_to_fib_prefix (nh, &pfx);
+ fib_index = fib_table_find (pfx.fp_proto, table_id);
+
+ if (~0 == fib_index)
+ return (VNET_API_ERROR_NO_SUCH_TABLE);
+
+ ip_pmtu_key_t key = {
+ .fproto = pfx.fp_proto,
+ .table_id = table_id,
+ .nh = pfx.fp_addr,
+ };
+
+ p = hash_get_mem (ip_pmtu_db, &key);
+
+ if (!ip_path_mtu_value_invalid (pmtu))
+ {
+ /* Add or update of path MTU */
+ if (NULL == p)
+ ip_pmtu_alloc (fib_index, &pfx, &key, pmtu);
+ else
+ ip_pmtu_update (p[0], pmtu);
+ }
+ else
+ {
+ if (NULL != p)
+ ip_pmtu_destroy (p[0], &key);
+ }
+
+ return (0);
+}
+
+static walk_rc_t
+ip_path_mtu_walk_mark (index_t ipti, void *ctx)
+{
+ ip_pmtu_t *ipt;
+
+ ipt = ip_path_mtu_get (ipti);
+
+ ipt->ipt_flags |= IP_PMTU_FLAG_STALE;
+
+ return (WALK_CONTINUE);
+}
+
/**
 * Context for the sweep walk: accumulates the indices of stale
 * trackers so they can be deleted after the pool walk completes.
 */
typedef struct ip_path_mtu_walk_sweep_ctx_t_
{
  index_t *indicies; /* vector of stale tracker indices */
} ip_path_mtu_walk_sweep_ctx_t;
+
+static walk_rc_t
+ip_path_mtu_walk_sweep (index_t ipti, void *arg)
+{
+ ip_path_mtu_walk_sweep_ctx_t *ctx = arg;
+ ip_pmtu_t *ipt;
+
+ ipt = ip_path_mtu_get (ipti);
+
+ if (ipt->ipt_flags & IP_PMTU_FLAG_STALE)
+ {
+ vec_add1 (ctx->indicies, ipti);
+ }
+
+ return (WALK_CONTINUE);
+}
+
+int
+ip_path_mtu_replace_begin (void)
+{
+ IP_PMTU_DBG ("replace-begin");
+
+ ip_path_mtu_walk (ip_path_mtu_walk_mark, NULL);
+
+ return (0);
+}
+
+int
+ip_path_mtu_replace_end (void)
+{
+ index_t *ipti;
+
+ IP_PMTU_DBG ("replace-end");
+
+ /*
+ * not safe to walk the pool whilst deleting, so create
+ * temporary storage of stale entries
+ */
+ ip_path_mtu_walk_sweep_ctx_t ctx = {
+ .indicies = NULL,
+ };
+
+ ip_path_mtu_walk (ip_path_mtu_walk_sweep, &ctx);
+
+ vec_foreach (ipti, ctx.indicies)
+ {
+ ip_pmtu_t *ipt;
+ ip_address_t ip;
+
+ ipt = ip_path_mtu_get (*ipti);
+ ip_pmtu_get_ip (ipt, &ip);
+ ip_path_mtu_update (&ip, ip_pmtu_get_table_id (ipt), 0);
+ }
+
+ vec_free (ctx.indicies);
+
+ return (0);
+}
+
+void
+ip_path_mtu_walk (ip_path_mtu_walk_t fn, void *ctx)
+{
+ index_t ipmi;
+
+ pool_foreach_index (ipmi, ip_pmtu_pool)
+ {
+ if (WALK_STOP == fn (ipmi, ctx))
+ break;
+ }
+}
+
+static fib_node_t *
+ip_pmtu_get_node (fib_node_index_t index)
+{
+ ip_pmtu_t *ipt;
+
+ ipt = pool_elt_at_index (ip_pmtu_pool, index);
+
+ return (&(ipt->ipt_node));
+}
+
+static ip_pmtu_t *
+ip_pmtu_get_from_node (fib_node_t *node)
+{
+ return (
+ (ip_pmtu_t *) (((char *) node) - STRUCT_OFFSET_OF (ip_pmtu_t, ipt_node)));
+}
+
+static void
+ip_pmtu_last_lock_gone (fib_node_t *node)
+{
+ /*
+ * the lifetime of the entry is managed by the API.
+ */
+ ASSERT (0);
+}
+
/*
 * A back walk has reached this path-MTU tracker
 */
static fib_node_back_walk_rc_t
ip_pmtu_back_walk_notify (fib_node_t *node, fib_node_back_walk_ctx_t *ctx)
{
  /*
   * the tracked FIB entry has changed: re-evaluate attachment and
   * re-apply the MTU (adjacency delegate or interposed DPO)
   */
  ip_pmtu_t *ipr = ip_pmtu_get_from_node (node);

  ip_pmtu_stack (ipr);

  /*
   * no need to propagate further up the graph, since there's nothing there
   */
  return (FIB_NODE_BACK_WALK_CONTINUE);
}
+
/**
 * FIB graph-node virtual functions for the path-MTU tracker type
 */
static const fib_node_vft_t ip_ptmu_fib_node_vft = {
  .fnv_get = ip_pmtu_get_node,
  .fnv_last_lock = ip_pmtu_last_lock_gone,
  .fnv_back_walk = ip_pmtu_back_walk_notify,
};
+
/**
 * Module init: register the adjacency delegate, FIB source, FIB node
 * and DPO types used by the path-MTU feature, and create the
 * key->tracker DB.
 */
static clib_error_t *
ip_path_module_init (vlib_main_t *vm)
{
  ip_pmtu_adj_delegate_type =
    adj_delegate_register_new_type (&ip_path_adj_delegate_vft);
  /* high priority so the interpose sits in front of routing sources */
  ip_pmtu_source = fib_source_allocate ("path-mtu", FIB_SOURCE_PRIORITY_HI,
					FIB_SOURCE_BH_SIMPLE);
  ip_pmtu_fib_type = fib_node_register_new_type (&ip_ptmu_fib_node_vft);

  /* keys are stored by value (hash_set_mem_alloc / hash_unset_mem_free) */
  ip_pmtu_db = hash_create_mem (0, sizeof (ip_pmtu_key_t), sizeof (index_t));
  ip_pmtu_logger = vlib_log_register_class ("ip", "pmtu");
  ip_pmtu_dpo_type =
    dpo_register_new_type (&ip_pmtu_dpo_vft, ip_pmtu_dpo_nodes);

  return (NULL);
}

VLIB_INIT_FUNCTION (ip_path_module_init);
+
+static clib_error_t *
+show_ip_pmtu_command (vlib_main_t *vm, unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ index_t ipti;
+
+ if (unformat (input, "%d", &ipti))
+ {
+ /*
+ * show one in detail
+ */
+ if (!pool_is_free_index (ip_pmtu_pool, ipti))
+ vlib_cli_output (vm, "%U", format_ip_pmtu, ipti);
+ else
+ vlib_cli_output (vm, "entry %d invalid", ipti);
+ }
+ else
+ {
+ /*
+ * show all
+ */
+ pool_foreach_index (ipti, ip_pmtu_pool)
+ {
+ vlib_cli_output (vm, "%U", format_ip_pmtu, ipti);
+ }
+ }
+
+ return (NULL);
+}
+
+VLIB_CLI_COMMAND (show_fib_entry, static) = {
+ .path = "show ip pmtu",
+ .function = show_ip_pmtu_command,
+ .short_help = "show ip path MTU",
+};
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/ip/ip_path_mtu.h b/src/vnet/ip/ip_path_mtu.h
new file mode 100644
index 00000000000..2c54fcd7401
--- /dev/null
+++ b/src/vnet/ip/ip_path_mtu.h
@@ -0,0 +1,126 @@
+/*
+ *------------------------------------------------------------------
+ * ip_path_mtu.h
+ *
+ * Copyright (c) 2021 Graphiant.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vnet/ip/ip.h>
+
/* NOTE(review): this header has no include guard (#ifndef/#pragma once) —
 * confirm whether double inclusion is possible and add one if so */

/**
 * @brief
 * The Path MTU DPO. interposed in the forwarding chain of the host prefix.
 */
typedef struct ip_pmtu_dpo_t_
{
  /**
   * The protocol of packets using this DPO
   */
  dpo_proto_t ipm_proto;

  /* explicit padding so the u16 members stay naturally aligned */
  u8 __pad8;

  /**
   * Configured Path Mtu
   */
  u16 ipm_pmtu;

  /**
   * number of locks.
   */
  u16 ipm_locks;

  /**
   * Stacked DPO
   */
  dpo_id_t ipm_dpo;
} ip_pmtu_dpo_t;

/*
 * PMTU DPOs are accessed in the data-path so they should not straddle a cache
 * line. Align to a integer factor of a cacheline
 */
STATIC_ASSERT_SIZEOF (ip_pmtu_dpo_t, 2 * sizeof (u64));
+
/**
 * Tracker state flags: ATTACHED = MTU enforced via adjacency
 * delegates; REMOTE = enforced via an interposed DPO; STALE =
 * mark-and-sweep marker used by the replace (begin/end) semantics.
 */
#define foreach_ip_pmtu_flag \
  _ (ATTACHED, 0, "attached") \
  _ (REMOTE, 1, "remote") \
  _ (STALE, 2, "stale")

typedef enum ip_pmtu_flags_t_
{
#define _(a, b, c) IP_PMTU_FLAG_##a = (1 << b),
  foreach_ip_pmtu_flag
#undef _
} ip_pmtu_flags_t;
+
/**
 * Remote Path MTU tracking object
 */
typedef struct ip_pmtu_t_
{
  /** linkage into the FIB graph */
  fib_node_t ipt_node;

  /** Track fib entry */
  fib_node_index_t ipt_fib_entry;
  /* sibling index on the tracked entry's child list */
  u32 ipt_sibling;
  ip_pmtu_flags_t ipt_flags;

  /** Configured MTU */
  u16 ipt_cfg_pmtu;

  /** MTU from the parent MTU
   * NOTE(review): not referenced by the visible .c code — confirm
   * whether this field is used elsewhere or is dead */
  u16 ipt_parent_pmtu;

  /** operational MTU; the minimum value of the cfg and parent MTU */
  u16 ipt_oper_pmtu;
} ip_pmtu_t;
+
/**
 * Set, update or remove (invalid pmtu value) the path MTU for peer
 * 'nh' in 'table_id'.
 * @return 0, or VNET_API_ERROR_NO_SUCH_TABLE
 */
extern int ip_path_mtu_update (const ip_address_t *nh, u32 table_id, u16 pmtu);

/** Callback invoked per tracker; return WALK_STOP to end the walk */
typedef walk_rc_t (*ip_path_mtu_walk_t) (index_t ipti, void *ctx);

extern void ip_path_mtu_walk (ip_path_mtu_walk_t fn, void *ctx);
/* mark/sweep support for API replace semantics */
extern int ip_path_mtu_replace_begin (void);
extern int ip_path_mtu_replace_end (void);

extern u32 ip_pmtu_get_table_id (const ip_pmtu_t *ipt);
extern void ip_pmtu_get_ip (const ip_pmtu_t *ipt, ip_address_t *ip);
+
/**
 * Data-plane accessor functions
 */
extern ip_pmtu_dpo_t *ip_pmtu_dpo_pool;
/* fetch a path-MTU DPO by pool index (no bounds/type checking beyond
 * the pool's own assertions) */
static_always_inline ip_pmtu_dpo_t *
ip_pmtu_dpo_get (index_t index)
{
  return (pool_elt_at_index (ip_pmtu_dpo_pool, index));
}
+
extern ip_pmtu_t *ip_pmtu_pool;
/* fetch a path-MTU tracker by pool index */
static_always_inline ip_pmtu_t *
ip_path_mtu_get (index_t index)
{
  return (pool_elt_at_index (ip_pmtu_pool, index));
}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/ip/ip_path_mtu_node.c b/src/vnet/ip/ip_path_mtu_node.c
new file mode 100644
index 00000000000..b13f9de849c
--- /dev/null
+++ b/src/vnet/ip/ip_path_mtu_node.c
@@ -0,0 +1,206 @@
+/*
+ *------------------------------------------------------------------
+ * ip_path_mtu.c
+ *
+ * Copyright (c) 2020 Graphiant.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vnet/ip/ip_path_mtu.h>
+#include <vnet/ip/ip_frag.h>
+
/** next-node indices for the pmtu DPO graph nodes */
typedef enum
{
  IP_PMTU_DROP,
  IP_PMTU_N_NEXT,
} ip_pmtu_next_t;

/** per-packet trace: the MTU applied and the pre-fragmentation size */
typedef struct ip_pmtu_trace_t_
{
  u16 pmtu;
  u16 packet_size;
} ip_pmtu_trace_t;
+
+static u8 *
+format_ip_pmtu_trace (u8 *s, va_list *args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ ip_pmtu_trace_t *t = va_arg (*args, ip_pmtu_trace_t *);
+
+ s = format (s, "path mtu:%d packet size:%d", t->pmtu, t->packet_size);
+
+ return s;
+}
+
/**
 * Forwarding node for the path-MTU DPO: fragment packets larger than
 * the configured path MTU, then forward all resulting buffers to the
 * stacked parent DPO's node.
 *
 * @param af  AF_IP4 or AF_IP6; selects the fragmentation routine
 */
static inline uword
ip_pmtu_dpo_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
		    vlib_frame_t *frame, ip_address_family_t af)
{
  u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
  u32 frag_sent = 0, small_packets = 0;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;

  /* reused vector collecting the fragments of the current packet */
  u32 *buffer = 0;

  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
	{
	  const ip_pmtu_dpo_t *ipm0;
	  u32 pi0, *frag_from, frag_left;
	  vlib_buffer_t *p0;
	  ip_frag_error_t error0;
	  u16 next0;

	  /*
	   * Note: The packet is not enqueued now. It is instead put
	   * in a vector where other fragments will be put as well.
	   */
	  pi0 = from[0];
	  from += 1;
	  n_left_from -= 1;

	  /* pop this DPO: forward the packet(s) to the stacked parent */
	  p0 = vlib_get_buffer (vm, pi0);
	  ipm0 = ip_pmtu_dpo_get (vnet_buffer (p0)->ip.adj_index[VLIB_TX]);
	  vnet_buffer (p0)->ip.adj_index[VLIB_TX] = ipm0->ipm_dpo.dpoi_index;
	  next0 = ipm0->ipm_dpo.dpoi_next_node;

	  if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED))
	    {
	      ip_pmtu_trace_t *t;
	      t = vlib_add_trace (vm, node, p0, sizeof (*t));
	      t->pmtu = ipm0->ipm_pmtu;
	      t->packet_size = vlib_buffer_length_in_chain (vm, p0);
	    }

	  if (AF_IP6 == af)
	    error0 =
	      ip6_frag_do_fragment (vm, pi0, ipm0->ipm_pmtu, 0, &buffer);
	  else
	    error0 =
	      ip4_frag_do_fragment (vm, pi0, ipm0->ipm_pmtu, 0, &buffer);

	  /* IPv4 with DF set cannot be fragmented: send ICMP too-big */
	  if (AF_IP4 == af && error0 == IP_FRAG_ERROR_DONT_FRAGMENT_SET)
	    {
	      icmp4_error_set_vnet_buffer (
		p0, ICMP4_destination_unreachable,
		ICMP4_destination_unreachable_fragmentation_needed_and_dont_fragment_set,
		ipm0->ipm_pmtu);
	      next0 = IP_FRAG_NEXT_ICMP_ERROR;
	    }
	  else
	    {
	      next0 =
		(error0 == IP_FRAG_ERROR_NONE ? next0 : IP_FRAG_NEXT_DROP);
	    }

	  if (error0 == IP_FRAG_ERROR_NONE)
	    {
	      /* Free original buffer chain */
	      frag_sent += vec_len (buffer);
	      small_packets += (vec_len (buffer) == 1);
	      vlib_buffer_free_one (vm, pi0); /* Free original packet */
	    }
	  else
	    {
	      vlib_error_count (vm, node->node_index, error0, 1);
	      vec_add1 (buffer, pi0); /* Get rid of the original buffer */
	    }

	  /* Send fragments that were added in the frame */
	  frag_from = buffer;
	  frag_left = vec_len (buffer);

	  while (frag_left > 0)
	    {
	      while (frag_left > 0 && n_left_to_next > 0)
		{
		  u32 i;
		  i = to_next[0] = frag_from[0];
		  frag_from += 1;
		  frag_left -= 1;
		  to_next += 1;
		  n_left_to_next -= 1;

		  vlib_get_buffer (vm, i)->error = node->errors[error0];
		  vlib_validate_buffer_enqueue_x1 (
		    vm, node, next_index, to_next, n_left_to_next, i, next0);
		}
	      /* frame exhausted mid-packet: flush and grab a fresh one */
	      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
	      vlib_get_next_frame (vm, node, next_index, to_next,
				   n_left_to_next);
	    }
	  vec_reset_length (buffer);
	}
      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }
  vec_free (buffer);

  return frame->n_vectors;
}
+
+// clang-format off
+
+VLIB_NODE_FN (ip4_ip_pmtu_dpo_node) (vlib_main_t *vm,
+ vlib_node_runtime_t *node,
+ vlib_frame_t *from_frame)
+{
+ return (ip_pmtu_dpo_inline (vm, node, from_frame, 0));
+}
+
+VLIB_NODE_FN (ip6_ip_pmtu_dpo_node) (vlib_main_t *vm,
+ vlib_node_runtime_t *node,
+ vlib_frame_t *from_frame)
+{
+ return (ip_pmtu_dpo_inline (vm, node, from_frame, 1));
+}
+
/* graph-node registrations; the only static next is the per-AF drop,
 * all other nexts are resolved from the stacked parent DPO */
VLIB_REGISTER_NODE (ip4_ip_pmtu_dpo_node) = {
  .name = "ip4-pmtu-dpo",
  .vector_size = sizeof (u32),
  .format_trace = format_ip_pmtu_trace,
  .n_errors = 0,
  .n_next_nodes = IP_PMTU_N_NEXT,
  .next_nodes =
  {
    [IP_PMTU_DROP] = "ip4-drop",
  }
};
VLIB_REGISTER_NODE (ip6_ip_pmtu_dpo_node) = {
  .name = "ip6-pmtu-dpo",
  .vector_size = sizeof (u32),
  .format_trace = format_ip_pmtu_trace,
  .n_errors = 0,
  .n_next_nodes = IP_PMTU_N_NEXT,
  .next_nodes =
  {
    [IP_PMTU_DROP] = "ip6-drop",
  }
};
+
+// clang-format on
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */