Diffstat (limited to 'src/vnet/ip/reass')
-rw-r--r--  src/vnet/ip/reass/ip4_full_reass.c  | 687
-rw-r--r--  src/vnet/ip/reass/ip4_full_reass.h  |   3
-rw-r--r--  src/vnet/ip/reass/ip4_sv_reass.c    | 486
-rw-r--r--  src/vnet/ip/reass/ip4_sv_reass.h    |   1
-rw-r--r--  src/vnet/ip/reass/ip6_full_reass.c  | 741
-rw-r--r--  src/vnet/ip/reass/ip6_full_reass.h  |   2
-rw-r--r--  src/vnet/ip/reass/ip6_sv_reass.c    | 309
-rw-r--r--  src/vnet/ip/reass/ip6_sv_reass.h    |   1
-rw-r--r--  src/vnet/ip/reass/reassembly.rst    | 221
9 files changed, 1725 insertions, 726 deletions
diff --git a/src/vnet/ip/reass/ip4_full_reass.c b/src/vnet/ip/reass/ip4_full_reass.c
index d2069c0876c..bab7d479dcf 100644
--- a/src/vnet/ip/reass/ip4_full_reass.c
+++ b/src/vnet/ip/reass/ip4_full_reass.c
@@ -23,16 +23,21 @@
#include <vppinfra/vec.h>
#include <vnet/vnet.h>
#include <vnet/ip/ip.h>
+#include <vnet/ip/ip.api_enum.h>
#include <vppinfra/fifo.h>
#include <vppinfra/bihash_16_8.h>
#include <vnet/ip/reass/ip4_full_reass.h>
#include <stddef.h>
#define MSEC_PER_SEC 1000
-#define IP4_REASS_TIMEOUT_DEFAULT_MS 100
-#define IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000 // 10 seconds default
+#define IP4_REASS_TIMEOUT_DEFAULT_MS 200
+
+/* As there are only 1024 reassembly contexts per thread, either DDoS attacks
+ * or even a fraction of real flows timing out would quickly consume these
+ * contexts, exhausting the context space and making reassembly impossible */
+#define IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 50 // 50 ms default
#define IP4_REASS_MAX_REASSEMBLIES_DEFAULT 1024
-#define IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
+#define IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
#define IP4_REASS_HT_LOAD_FACTOR (0.75)
#define IP4_REASS_DEBUG_BUFFERS 0
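(Editorial note, not part of the patch: a rough back-of-the-envelope check of the defaults above, using only the macros already defined here. With 1024 contexts per thread and a 200 ms timeout, a few thousand new fragmented flows per second are enough to keep the table permanently full, which is why stale contexts are now reclaimed on a 50 ms cadence instead of the old 10 s walk.)
/* illustrative arithmetic only, derived from the defaults above */
u32 flows_per_sec_to_fill_table =
  IP4_REASS_MAX_REASSEMBLIES_DEFAULT * MSEC_PER_SEC /
  IP4_REASS_TIMEOUT_DEFAULT_MS; /* 1024 * 1000 / 200 = 5120 */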
@@ -68,21 +73,19 @@ typedef enum
typedef struct
{
- union
+ struct
{
- struct
- {
- u32 xx_id;
- ip4_address_t src;
- ip4_address_t dst;
- u16 frag_id;
- u8 proto;
- u8 unused;
- };
- u64 as_u64[2];
+ u16 frag_id;
+ u8 proto;
+ u8 unused;
+ u32 fib_index;
+ ip4_address_t src;
+ ip4_address_t dst;
};
} ip4_full_reass_key_t;
+STATIC_ASSERT_SIZEOF (ip4_full_reass_key_t, 16);
+
typedef union
{
struct
@@ -155,6 +158,8 @@ typedef struct
ip4_full_reass_t *pool;
u32 reass_n;
u32 id_counter;
+ // for pacing the main thread timeouts
+ u32 last_id;
clib_spinlock_t lock;
} ip4_full_reass_per_thread_t;
@@ -177,17 +182,19 @@ typedef struct
// convenience
vlib_main_t *vlib_main;
- // node index of ip4-drop node
- u32 ip4_drop_idx;
u32 ip4_full_reass_expire_node_idx;
/** Worker handoff */
u32 fq_index;
+ u32 fq_local_index;
u32 fq_feature_index;
u32 fq_custom_index;
// reference count for enabling/disabling feature - per interface
u32 *feature_use_refcount_per_intf;
+
+ // whether local fragmented packets are reassembled or not
+ int is_local_reass_enabled;
} ip4_full_reass_main_t;
extern ip4_full_reass_main_t ip4_full_reass_main;
@@ -219,6 +226,7 @@ typedef enum
RANGE_OVERLAP,
FINALIZE,
HANDOFF,
+ PASSTHROUGH,
} ip4_full_reass_trace_operation_e;
typedef struct
@@ -329,6 +337,9 @@ format_ip4_full_reass_trace (u8 * s, va_list * args)
format (s, "handoff from thread #%u to thread #%u", t->thread_id,
t->thread_id_to);
break;
+ case PASSTHROUGH:
+ s = format (s, "passthrough - not a fragment");
+ break;
}
return s;
}
@@ -404,13 +415,16 @@ ip4_full_reass_free (ip4_full_reass_main_t * rm,
ip4_full_reass_per_thread_t * rt,
ip4_full_reass_t * reass)
{
- clib_bihash_kv_16_8_t kv;
- kv.key[0] = reass->key.as_u64[0];
- kv.key[1] = reass->key.as_u64[1];
+ clib_bihash_kv_16_8_t kv = {};
+ clib_memcpy_fast (&kv, &reass->key, sizeof (kv.key));
clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
return ip4_full_reass_free_ctx (rt, reass);
}
+/* n_left_to_next and to_next are taken as input params, as this function
+ * could be called from a graph node that is managing its own local copy of
+ * these variables; ignoring those and trying to enqueue the buffers using
+ * local variables would cause either a buffer leak or corruption */
always_inline void
ip4_full_reass_drop_all (vlib_main_t *vm, vlib_node_runtime_t *node,
ip4_full_reass_t *reass)
@@ -419,58 +433,103 @@ ip4_full_reass_drop_all (vlib_main_t *vm, vlib_node_runtime_t *node,
vlib_buffer_t *range_b;
vnet_buffer_opaque_t *range_vnb;
u32 *to_free = NULL;
+
while (~0 != range_bi)
{
range_b = vlib_get_buffer (vm, range_bi);
range_vnb = vnet_buffer (range_b);
- u32 bi = range_bi;
- while (~0 != bi)
+
+ if (~0 != range_bi)
{
- vec_add1 (to_free, bi);
- vlib_buffer_t *b = vlib_get_buffer (vm, bi);
- if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
- {
- bi = b->next_buffer;
- b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
- }
- else
- {
- bi = ~0;
- }
+ vec_add1 (to_free, range_bi);
}
+
range_bi = range_vnb->ip.reass.next_range_bi;
}
+
/* send to next_error_index */
- if (~0 != reass->error_next_index)
+ if (~0 != reass->error_next_index &&
+ reass->error_next_index < node->n_next_nodes)
{
- u32 n_left_to_next, *to_next, next_index;
+ u32 n_free = vec_len (to_free);
+
+ /* record number of packets sent to custom app */
+ vlib_node_increment_counter (vm, node->node_index,
+ IP4_ERROR_REASS_TO_CUSTOM_APP, n_free);
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ for (u32 i = 0; i < n_free; i++)
+ {
+ vlib_buffer_t *b = vlib_get_buffer (vm, to_free[i]);
+ if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED))
+ ip4_full_reass_add_trace (vm, node, reass, to_free[i],
+ RANGE_DISCARD, 0, ~0);
+ }
- next_index = reass->error_next_index;
- u32 bi = ~0;
+ vlib_buffer_enqueue_to_single_next (vm, node, to_free,
+ reass->error_next_index, n_free);
+ }
+ else
+ {
+ vlib_buffer_free (vm, to_free, vec_len (to_free));
+ }
+ vec_free (to_free);
+}
- while (vec_len (to_free) > 0)
- {
- vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+always_inline void
+sanitize_reass_buffers_add_missing (vlib_main_t *vm, ip4_full_reass_t *reass,
+ u32 *bi0)
+{
+ u32 range_bi = reass->first_bi;
+ vlib_buffer_t *range_b;
+ vnet_buffer_opaque_t *range_vnb;
- while (vec_len (to_free) > 0 && n_left_to_next > 0)
+ while (~0 != range_bi)
+ {
+ range_b = vlib_get_buffer (vm, range_bi);
+ range_vnb = vnet_buffer (range_b);
+ u32 bi = range_bi;
+ if (~0 != bi)
+ {
+ if (bi == *bi0)
+ *bi0 = ~0;
+ if (range_b->flags & VLIB_BUFFER_NEXT_PRESENT)
{
- bi = vec_pop (to_free);
-
- if (~0 != bi)
+ u32 _bi = bi;
+ vlib_buffer_t *_b = vlib_get_buffer (vm, _bi);
+ while (_b->flags & VLIB_BUFFER_NEXT_PRESENT)
{
- to_next[0] = bi;
- to_next += 1;
- n_left_to_next -= 1;
+ if (_b->next_buffer != range_vnb->ip.reass.next_range_bi)
+ {
+ _bi = _b->next_buffer;
+ _b = vlib_get_buffer (vm, _bi);
+ }
+ else
+ {
+ _b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
+ break;
+ }
}
}
- vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ range_bi = range_vnb->ip.reass.next_range_bi;
}
}
- else
+ if (*bi0 != ~0)
{
- vlib_buffer_free (vm, to_free, vec_len (to_free));
+ vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
+ vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
+ if (~0 != reass->first_bi)
+ {
+ fvnb->ip.reass.next_range_bi = reass->first_bi;
+ reass->first_bi = *bi0;
+ }
+ else
+ {
+ reass->first_bi = *bi0;
+ fvnb->ip.reass.next_range_bi = ~0;
+ }
+ *bi0 = ~0;
}
- vec_free (to_free);
}
always_inline void
@@ -484,10 +543,10 @@ ip4_full_reass_init (ip4_full_reass_t * reass)
}
always_inline ip4_full_reass_t *
-ip4_full_reass_find_or_create (vlib_main_t * vm, vlib_node_runtime_t * node,
- ip4_full_reass_main_t * rm,
- ip4_full_reass_per_thread_t * rt,
- ip4_full_reass_kv_t * kv, u8 * do_handoff)
+ip4_full_reass_find_or_create (vlib_main_t *vm, vlib_node_runtime_t *node,
+ ip4_full_reass_main_t *rm,
+ ip4_full_reass_per_thread_t *rt,
+ ip4_full_reass_kv_t *kv, u8 *do_handoff)
{
ip4_full_reass_t *reass;
f64 now;
@@ -510,6 +569,8 @@ again:
if (now > reass->last_heard + rm->timeout)
{
+ vlib_node_increment_counter (vm, node->node_index,
+ IP4_ERROR_REASS_TIMEOUT, 1);
ip4_full_reass_drop_all (vm, node, reass);
ip4_full_reass_free (rm, rt, reass);
reass = NULL;
@@ -538,8 +599,7 @@ again:
++rt->reass_n;
}
- reass->key.as_u64[0] = kv->kv.key[0];
- reass->key.as_u64[1] = kv->kv.key[1];
+ clib_memcpy_fast (&reass->key, &kv->kv.key, sizeof (reass->key));
kv->v.reass_index = (reass - rt->pool);
kv->v.memory_owner_thread_index = vm->thread_index;
reass->last_heard = now;
@@ -568,7 +628,6 @@ ip4_full_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
vlib_buffer_t *last_b = NULL;
u32 sub_chain_bi = reass->first_bi;
u32 total_length = 0;
- u32 buf_cnt = 0;
do
{
u32 tmp_bi = sub_chain_bi;
@@ -605,7 +664,6 @@ ip4_full_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
vlib_buffer_length_in_chain (vm, tmp) - trim_front - trim_end;
while (1)
{
- ++buf_cnt;
if (trim_front)
{
if (trim_front > tmp->current_length)
@@ -755,6 +813,16 @@ ip4_full_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
*next0 = reass->next_index;
}
vnet_buffer (first_b)->ip.reass.estimated_mtu = reass->min_fragment_length;
+
+ /* Keep track of the number of successfully reassembled packets and the
+ * number of fragments reassembled */
+ vlib_node_increment_counter (vm, node->node_index, IP4_ERROR_REASS_SUCCESS,
+ 1);
+
+ vlib_node_increment_counter (vm, node->node_index,
+ IP4_ERROR_REASS_FRAGMENTS_REASSEMBLED,
+ reass->fragments_n);
+
*error0 = IP4_ERROR_NONE;
ip4_full_reass_free (rm, rt, reass);
reass = NULL;
@@ -1090,199 +1158,216 @@ ip4_full_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
}
always_inline uword
-ip4_full_reass_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
- vlib_frame_t * frame, ip4_full_reass_node_type_t type)
+ip4_full_reass_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
+ vlib_frame_t *frame, ip4_full_reass_node_type_t type,
+ bool is_local)
{
u32 *from = vlib_frame_vector_args (frame);
- u32 n_left_from, n_left_to_next, *to_next, next_index;
+ u32 n_left, n_next = 0, to_next[VLIB_FRAME_SIZE];
ip4_full_reass_main_t *rm = &ip4_full_reass_main;
ip4_full_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
+ u16 nexts[VLIB_FRAME_SIZE];
+
clib_spinlock_lock (&rt->lock);
- n_left_from = frame->n_vectors;
- next_index = node->cached_next_index;
- while (n_left_from > 0)
+ n_left = frame->n_vectors;
+ while (n_left > 0)
{
- vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
-
- while (n_left_from > 0 && n_left_to_next > 0)
- {
- u32 bi0;
- vlib_buffer_t *b0;
- u32 next0;
- u32 error0 = IP4_ERROR_NONE;
+ u32 bi0;
+ vlib_buffer_t *b0;
+ u32 next0;
+ u32 error0 = IP4_ERROR_NONE;
- bi0 = from[0];
- b0 = vlib_get_buffer (vm, bi0);
+ bi0 = from[0];
+ b0 = vlib_get_buffer (vm, bi0);
- ip4_header_t *ip0 = vlib_buffer_get_current (b0);
- if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
+ ip4_header_t *ip0 = vlib_buffer_get_current (b0);
+ if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
+ {
+ // this is a whole packet - no fragmentation
+ if (CUSTOM != type)
{
- // this is a whole packet - no fragmentation
- if (CUSTOM != type)
- {
- next0 = IP4_FULL_REASS_NEXT_INPUT;
- }
- else
- {
- next0 = vnet_buffer (b0)->ip.reass.next_index;
- }
- goto packet_enqueue;
+ next0 = IP4_FULL_REASS_NEXT_INPUT;
}
- const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
- const u32 fragment_length =
- clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
- const u32 fragment_last = fragment_first + fragment_length - 1;
- if (fragment_first > fragment_last || fragment_first + fragment_length > UINT16_MAX - 20 || (fragment_length < 8 && ip4_get_fragment_more (ip0))) // 8 is minimum frag length per RFC 791
+ else
{
- next0 = IP4_FULL_REASS_NEXT_DROP;
- error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
- goto packet_enqueue;
+ next0 = vnet_buffer (b0)->ip.reass.next_index;
}
- ip4_full_reass_kv_t kv;
- u8 do_handoff = 0;
-
- kv.k.as_u64[0] =
- (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
- vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
- (u64) ip0->src_address.as_u32 << 32;
- kv.k.as_u64[1] =
- (u64) ip0->dst_address.
- as_u32 | (u64) ip0->fragment_id << 32 | (u64) ip0->protocol << 48;
-
- ip4_full_reass_t *reass =
- ip4_full_reass_find_or_create (vm, node, rm, rt, &kv,
- &do_handoff);
-
- if (reass)
+ ip4_full_reass_add_trace (vm, node, NULL, bi0, PASSTHROUGH, 0, ~0);
+ goto packet_enqueue;
+ }
+
+ if (is_local && !rm->is_local_reass_enabled)
+ {
+ next0 = IP4_FULL_REASS_NEXT_DROP;
+ goto packet_enqueue;
+ }
+
+ const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
+ const u32 fragment_length =
+ clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
+ const u32 fragment_last = fragment_first + fragment_length - 1;
+
+ /* Keep track of received fragments */
+ vlib_node_increment_counter (vm, node->node_index,
+ IP4_ERROR_REASS_FRAGMENTS_RCVD, 1);
+
+ if (fragment_first > fragment_last ||
+ fragment_first + fragment_length > UINT16_MAX - 20 ||
+ (fragment_length < 8 && // 8 is minimum frag length per RFC 791
+ ip4_get_fragment_more (ip0)))
+ {
+ next0 = IP4_FULL_REASS_NEXT_DROP;
+ error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
+ goto packet_enqueue;
+ }
+
+ u32 fib_index = (vnet_buffer (b0)->sw_if_index[VLIB_TX] == (u32) ~0) ?
+ vec_elt (ip4_main.fib_index_by_sw_if_index,
+ vnet_buffer (b0)->sw_if_index[VLIB_RX]) :
+ vnet_buffer (b0)->sw_if_index[VLIB_TX];
+
+ ip4_full_reass_kv_t kv = { .k.fib_index = fib_index,
+ .k.src.as_u32 = ip0->src_address.as_u32,
+ .k.dst.as_u32 = ip0->dst_address.as_u32,
+ .k.frag_id = ip0->fragment_id,
+ .k.proto = ip0->protocol
+
+ };
+ u8 do_handoff = 0;
+
+ ip4_full_reass_t *reass =
+ ip4_full_reass_find_or_create (vm, node, rm, rt, &kv, &do_handoff);
+
+ if (reass)
+ {
+ const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
+ if (0 == fragment_first)
{
- const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
- if (0 == fragment_first)
- {
- reass->sendout_thread_index = vm->thread_index;
- }
+ reass->sendout_thread_index = vm->thread_index;
}
+ }
- if (PREDICT_FALSE (do_handoff))
+ if (PREDICT_FALSE (do_handoff))
+ {
+ next0 = IP4_FULL_REASS_NEXT_HANDOFF;
+ vnet_buffer (b0)->ip.reass.owner_thread_index =
+ kv.v.memory_owner_thread_index;
+ }
+ else if (reass)
+ {
+ u32 handoff_thread_idx;
+ u32 counter = ~0;
+ switch (ip4_full_reass_update (vm, node, rm, rt, reass, &bi0, &next0,
+ &error0, CUSTOM == type,
+ &handoff_thread_idx))
{
+ case IP4_REASS_RC_OK:
+ /* nothing to do here */
+ break;
+ case IP4_REASS_RC_HANDOFF:
next0 = IP4_FULL_REASS_NEXT_HANDOFF;
+ b0 = vlib_get_buffer (vm, bi0);
vnet_buffer (b0)->ip.reass.owner_thread_index =
- kv.v.memory_owner_thread_index;
- }
- else if (reass)
- {
- u32 handoff_thread_idx;
- switch (ip4_full_reass_update
- (vm, node, rm, rt, reass, &bi0, &next0,
- &error0, CUSTOM == type, &handoff_thread_idx))
- {
- case IP4_REASS_RC_OK:
- /* nothing to do here */
- break;
- case IP4_REASS_RC_HANDOFF:
- next0 = IP4_FULL_REASS_NEXT_HANDOFF;
- b0 = vlib_get_buffer (vm, bi0);
- vnet_buffer (b0)->ip.reass.owner_thread_index =
- handoff_thread_idx;
- break;
- case IP4_REASS_RC_TOO_MANY_FRAGMENTS:
- vlib_node_increment_counter (vm, node->node_index,
- IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG,
- 1);
- ip4_full_reass_drop_all (vm, node, reass);
- ip4_full_reass_free (rm, rt, reass);
- goto next_packet;
- break;
- case IP4_REASS_RC_NO_BUF:
- vlib_node_increment_counter (vm, node->node_index,
- IP4_ERROR_REASS_NO_BUF, 1);
- ip4_full_reass_drop_all (vm, node, reass);
- ip4_full_reass_free (rm, rt, reass);
- goto next_packet;
- break;
- case IP4_REASS_RC_INTERNAL_ERROR:
- /* drop everything and start with a clean slate */
- vlib_node_increment_counter (vm, node->node_index,
- IP4_ERROR_REASS_INTERNAL_ERROR,
- 1);
- ip4_full_reass_drop_all (vm, node, reass);
- ip4_full_reass_free (rm, rt, reass);
- goto next_packet;
- break;
- }
+ handoff_thread_idx;
+ break;
+ case IP4_REASS_RC_TOO_MANY_FRAGMENTS:
+ counter = IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG;
+ break;
+ case IP4_REASS_RC_NO_BUF:
+ counter = IP4_ERROR_REASS_NO_BUF;
+ break;
+ case IP4_REASS_RC_INTERNAL_ERROR:
+ counter = IP4_ERROR_REASS_INTERNAL_ERROR;
+ /* Sanitization is needed in internal error cases only, as
+ * the incoming packet is already dropped in other cases;
+ * adding bi0 back to the reassembly list also fixes the
+ * leaking of buffers during internal errors.
+ *
+ * It also doesn't make sense to send these buffers to the
+ * custom app, as these fragments carry internal errors */
+ sanitize_reass_buffers_add_missing (vm, reass, &bi0);
+ reass->error_next_index = ~0;
+ break;
}
- else
+
+ if (~0 != counter)
{
- next0 = IP4_FULL_REASS_NEXT_DROP;
- error0 = IP4_ERROR_REASS_LIMIT_REACHED;
+ vlib_node_increment_counter (vm, node->node_index, counter, 1);
+ ip4_full_reass_drop_all (vm, node, reass);
+ ip4_full_reass_free (rm, rt, reass);
+ goto next_packet;
}
+ }
+ else
+ {
+ next0 = IP4_FULL_REASS_NEXT_DROP;
+ error0 = IP4_ERROR_REASS_LIMIT_REACHED;
+ }
+ packet_enqueue:
- packet_enqueue:
-
- if (bi0 != ~0)
+ if (bi0 != ~0)
+ {
+ /* bi0 might have been updated by reass_finalize, reload */
+ b0 = vlib_get_buffer (vm, bi0);
+ if (IP4_ERROR_NONE != error0)
{
- to_next[0] = bi0;
- to_next += 1;
- n_left_to_next -= 1;
+ b0->error = node->errors[error0];
+ }
- /* bi0 might have been updated by reass_finalize, reload */
- b0 = vlib_get_buffer (vm, bi0);
- if (IP4_ERROR_NONE != error0)
+ if (next0 == IP4_FULL_REASS_NEXT_HANDOFF)
+ {
+ if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
{
- b0->error = node->errors[error0];
+ ip4_full_reass_add_trace (
+ vm, node, NULL, bi0, HANDOFF, 0,
+ vnet_buffer (b0)->ip.reass.owner_thread_index);
}
+ }
+ else if (FEATURE == type && IP4_ERROR_NONE == error0)
+ {
+ vnet_feature_next (&next0, b0);
+ }
- if (next0 == IP4_FULL_REASS_NEXT_HANDOFF)
- {
- if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
- {
- ip4_full_reass_add_trace (
- vm, node, NULL, bi0, HANDOFF, 0,
- vnet_buffer (b0)->ip.reass.owner_thread_index);
- }
- }
- else if (FEATURE == type && IP4_ERROR_NONE == error0)
- {
- vnet_feature_next (&next0, b0);
- }
- vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
- to_next, n_left_to_next,
- bi0, next0);
- IP4_REASS_DEBUG_BUFFER (bi0, enqueue_next);
+ /* Also increment the to-custom-app counter, as this fragment is
+ * going to the application as well */
+ if (CUSTOM == type)
+ {
+ vlib_node_increment_counter (vm, node->node_index,
+ IP4_ERROR_REASS_TO_CUSTOM_APP, 1);
}
- next_packet:
- from += 1;
- n_left_from -= 1;
+ to_next[n_next] = bi0;
+ nexts[n_next] = next0;
+ n_next++;
+ IP4_REASS_DEBUG_BUFFER (bi0, enqueue_next);
}
- vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ next_packet:
+ from += 1;
+ n_left -= 1;
}
clib_spinlock_unlock (&rt->lock);
+
+ vlib_buffer_enqueue_to_next (vm, node, to_next, nexts, n_next);
return frame->n_vectors;
}
-static char *ip4_full_reass_error_strings[] = {
-#define _(sym, string) string,
- foreach_ip4_error
-#undef _
-};
-
VLIB_NODE_FN (ip4_full_reass_node) (vlib_main_t * vm,
vlib_node_runtime_t * node,
vlib_frame_t * frame)
{
- return ip4_full_reass_inline (vm, node, frame, NORMAL);
+ return ip4_full_reass_inline (vm, node, frame, NORMAL, false /* is_local */);
}
VLIB_REGISTER_NODE (ip4_full_reass_node) = {
.name = "ip4-full-reassembly",
.vector_size = sizeof (u32),
.format_trace = format_ip4_full_reass_trace,
- .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
- .error_strings = ip4_full_reass_error_strings,
+ .n_errors = IP4_N_ERROR,
+ .error_counters = ip4_error_counters,
.n_next_nodes = IP4_FULL_REASS_N_NEXT,
.next_nodes =
{
@@ -1293,19 +1378,42 @@ VLIB_REGISTER_NODE (ip4_full_reass_node) = {
},
};
+VLIB_NODE_FN (ip4_local_full_reass_node)
+(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
+{
+ return ip4_full_reass_inline (vm, node, frame, NORMAL, true /* is_local */);
+}
+
+VLIB_REGISTER_NODE (ip4_local_full_reass_node) = {
+ .name = "ip4-local-full-reassembly",
+ .vector_size = sizeof (u32),
+ .format_trace = format_ip4_full_reass_trace,
+ .n_errors = IP4_N_ERROR,
+ .error_counters = ip4_error_counters,
+ .n_next_nodes = IP4_FULL_REASS_N_NEXT,
+ .next_nodes =
+ {
+ [IP4_FULL_REASS_NEXT_INPUT] = "ip4-input",
+ [IP4_FULL_REASS_NEXT_DROP] = "ip4-drop",
+ [IP4_FULL_REASS_NEXT_HANDOFF] = "ip4-local-full-reassembly-handoff",
+
+ },
+};
+
VLIB_NODE_FN (ip4_full_reass_node_feature) (vlib_main_t * vm,
vlib_node_runtime_t * node,
vlib_frame_t * frame)
{
- return ip4_full_reass_inline (vm, node, frame, FEATURE);
+ return ip4_full_reass_inline (vm, node, frame, FEATURE,
+ false /* is_local */);
}
VLIB_REGISTER_NODE (ip4_full_reass_node_feature) = {
.name = "ip4-full-reassembly-feature",
.vector_size = sizeof (u32),
.format_trace = format_ip4_full_reass_trace,
- .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
- .error_strings = ip4_full_reass_error_strings,
+ .n_errors = IP4_N_ERROR,
+ .error_counters = ip4_error_counters,
.n_next_nodes = IP4_FULL_REASS_N_NEXT,
.next_nodes =
{
@@ -1316,26 +1424,26 @@ VLIB_REGISTER_NODE (ip4_full_reass_node_feature) = {
};
VNET_FEATURE_INIT (ip4_full_reass_feature, static) = {
- .arc_name = "ip4-unicast",
- .node_name = "ip4-full-reassembly-feature",
- .runs_before = VNET_FEATURES ("ip4-lookup",
- "ipsec4-input-feature"),
- .runs_after = 0,
+ .arc_name = "ip4-unicast",
+ .node_name = "ip4-full-reassembly-feature",
+ .runs_before = VNET_FEATURES ("ip4-lookup", "ipsec4-input-feature",
+ "ip4-sv-reassembly-feature"),
+ .runs_after = 0,
};
VLIB_NODE_FN (ip4_full_reass_node_custom) (vlib_main_t * vm,
vlib_node_runtime_t * node,
vlib_frame_t * frame)
{
- return ip4_full_reass_inline (vm, node, frame, CUSTOM);
+ return ip4_full_reass_inline (vm, node, frame, CUSTOM, false /* is_local */);
}
VLIB_REGISTER_NODE (ip4_full_reass_node_custom) = {
.name = "ip4-full-reassembly-custom",
.vector_size = sizeof (u32),
.format_trace = format_ip4_full_reass_trace,
- .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
- .error_strings = ip4_full_reass_error_strings,
+ .n_errors = IP4_N_ERROR,
+ .error_counters = ip4_error_counters,
.n_next_nodes = IP4_FULL_REASS_N_NEXT,
.next_nodes =
{
@@ -1345,15 +1453,6 @@ VLIB_REGISTER_NODE (ip4_full_reass_node_custom) = {
},
};
-VNET_FEATURE_INIT (ip4_full_reass_custom, static) = {
- .arc_name = "ip4-unicast",
- .node_name = "ip4-full-reassembly-feature",
- .runs_before = VNET_FEATURES ("ip4-lookup",
- "ipsec4-input-feature"),
- .runs_after = 0,
-};
-
-
#ifndef CLIB_MARCH_VARIANT
uword
ip4_full_reass_custom_register_next_node (uword node_index)
@@ -1369,7 +1468,9 @@ ip4_full_reass_get_nbuckets ()
u32 nbuckets;
u8 i;
- nbuckets = (u32) (rm->max_reass_n / IP4_REASS_HT_LOAD_FACTOR);
+ /* need more mem with more workers */
+ nbuckets = (u32) (rm->max_reass_n * (vlib_num_workers () + 1) /
+ IP4_REASS_HT_LOAD_FACTOR);
for (i = 0; i < 31; i++)
if ((1 << i) >= nbuckets)
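(Editorial note: a worked instance of the sizing above, assuming 3 workers and the default limit of 1024 reassemblies; the values are illustrative only.)
/* nbuckets = 1024 * (3 + 1) / 0.75 = 5461, which the loop below rounds up
 * to the next power of two: 1 << 13 = 8192 buckets */
u32 nbuckets_example = (u32) (1024 * (3 + 1) / IP4_REASS_HT_LOAD_FACTOR);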
@@ -1495,17 +1596,17 @@ ip4_full_reass_init_function (vlib_main_t * vm)
nbuckets = ip4_full_reass_get_nbuckets ();
clib_bihash_init_16_8 (&rm->hash, "ip4-dr", nbuckets, nbuckets * 1024);
- node = vlib_get_node_by_name (vm, (u8 *) "ip4-drop");
- ASSERT (node);
- rm->ip4_drop_idx = node->index;
-
rm->fq_index = vlib_frame_queue_main_init (ip4_full_reass_node.index, 0);
+ rm->fq_local_index =
+ vlib_frame_queue_main_init (ip4_local_full_reass_node.index, 0);
rm->fq_feature_index =
vlib_frame_queue_main_init (ip4_full_reass_node_feature.index, 0);
rm->fq_custom_index =
vlib_frame_queue_main_init (ip4_full_reass_node_custom.index, 0);
rm->feature_use_refcount_per_intf = NULL;
+ rm->is_local_reass_enabled = 1;
+
return error;
}
@@ -1547,6 +1648,7 @@ ip4_full_reass_walk_expired (vlib_main_t *vm, vlib_node_runtime_t *node,
uword thread_index = 0;
int index;
const uword nthreads = vlib_num_workers () + 1;
+
for (thread_index = 0; thread_index < nthreads; ++thread_index)
{
ip4_full_reass_per_thread_t *rt =
@@ -1554,13 +1656,39 @@ ip4_full_reass_walk_expired (vlib_main_t *vm, vlib_node_runtime_t *node,
clib_spinlock_lock (&rt->lock);
vec_reset_length (pool_indexes_to_free);
- pool_foreach_index (index, rt->pool) {
- reass = pool_elt_at_index (rt->pool, index);
- if (now > reass->last_heard + rm->timeout)
- {
- vec_add1 (pool_indexes_to_free, index);
- }
- }
+
+ /* Pace the number of timeouts handled per thread, to avoid barrier
+ * sync issues in real-world scenarios */
+
+ u32 beg = rt->last_id;
+ /* to ensure we walk at least once per sec per context */
+ u32 end =
+ beg + (IP4_REASS_MAX_REASSEMBLIES_DEFAULT *
+ IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS / MSEC_PER_SEC +
+ 1);
+ if (end > vec_len (rt->pool))
+ {
+ end = vec_len (rt->pool);
+ rt->last_id = 0;
+ }
+ else
+ {
+ rt->last_id = end;
+ }
+
+ pool_foreach_stepping_index (index, beg, end, rt->pool)
+ {
+ reass = pool_elt_at_index (rt->pool, index);
+ if (now > reass->last_heard + rm->timeout)
+ {
+ vec_add1 (pool_indexes_to_free, index);
+ }
+ }
+
+ if (vec_len (pool_indexes_to_free))
+ vlib_node_increment_counter (vm, node->node_index,
+ IP4_ERROR_REASS_TIMEOUT,
+ vec_len (pool_indexes_to_free));
int *i;
vec_foreach (i, pool_indexes_to_free)
{
@@ -1575,7 +1703,7 @@ ip4_full_reass_walk_expired (vlib_main_t *vm, vlib_node_runtime_t *node,
vec_free (pool_indexes_to_free);
if (event_data)
{
- _vec_len (event_data) = 0;
+ vec_set_len (event_data, 0);
}
}
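(Editorial note: a minimal sketch of the pacing window computed in the hunk above, assuming the default limits; the variable name is illustrative.)
/* entries visited per 50 ms walk:
 *   1024 * 50 / 1000 + 1 = 52
 * 52 entries * 20 walks per second = 1040 >= 1024, so every context is still
 * examined at least once per second while each individual walk stays short */
u32 slice_per_walk = IP4_REASS_MAX_REASSEMBLIES_DEFAULT *
		       IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS /
		       MSEC_PER_SEC +
		     1;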
@@ -1583,13 +1711,12 @@ ip4_full_reass_walk_expired (vlib_main_t *vm, vlib_node_runtime_t *node,
}
VLIB_REGISTER_NODE (ip4_full_reass_expire_node) = {
- .function = ip4_full_reass_walk_expired,
- .type = VLIB_NODE_TYPE_PROCESS,
- .name = "ip4-full-reassembly-expire-walk",
- .format_trace = format_ip4_full_reass_trace,
- .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
- .error_strings = ip4_full_reass_error_strings,
-
+ .function = ip4_full_reass_walk_expired,
+ .type = VLIB_NODE_TYPE_PROCESS,
+ .name = "ip4-full-reassembly-expire-walk",
+ .format_trace = format_ip4_full_reass_trace,
+ .n_errors = IP4_N_ERROR,
+ .error_counters = ip4_error_counters,
};
static u8 *
@@ -1597,9 +1724,8 @@ format_ip4_full_reass_key (u8 * s, va_list * args)
{
ip4_full_reass_key_t *key = va_arg (*args, ip4_full_reass_key_t *);
s =
- format (s,
- "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
- key->xx_id, format_ip4_address, &key->src, format_ip4_address,
+ format (s, "fib_index: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
+ key->fib_index, format_ip4_address, &key->src, format_ip4_address,
&key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
return s;
}
@@ -1750,10 +1876,10 @@ format_ip4_full_reass_handoff_trace (u8 * s, va_list * args)
}
always_inline uword
-ip4_full_reass_handoff_node_inline (vlib_main_t * vm,
- vlib_node_runtime_t * node,
- vlib_frame_t * frame,
- ip4_full_reass_node_type_t type)
+ip4_full_reass_handoff_node_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
+ vlib_frame_t *frame,
+ ip4_full_reass_node_type_t type,
+ bool is_local)
{
ip4_full_reass_main_t *rm = &ip4_full_reass_main;
@@ -1772,7 +1898,14 @@ ip4_full_reass_handoff_node_inline (vlib_main_t * vm,
switch (type)
{
case NORMAL:
- fq_index = rm->fq_index;
+ if (is_local)
+ {
+ fq_index = rm->fq_local_index;
+ }
+ else
+ {
+ fq_index = rm->fq_index;
+ }
break;
case FEATURE:
fq_index = rm->fq_feature_index;
@@ -1782,7 +1915,6 @@ ip4_full_reass_handoff_node_inline (vlib_main_t * vm,
break;
default:
clib_warning ("Unexpected `type' (%d)!", type);
- ASSERT (0);
}
while (n_left_from > 0)
@@ -1816,7 +1948,8 @@ VLIB_NODE_FN (ip4_full_reass_handoff_node) (vlib_main_t * vm,
vlib_node_runtime_t * node,
vlib_frame_t * frame)
{
- return ip4_full_reass_handoff_node_inline (vm, node, frame, NORMAL);
+ return ip4_full_reass_handoff_node_inline (vm, node, frame, NORMAL,
+ false /* is_local */);
}
@@ -1834,16 +1967,36 @@ VLIB_REGISTER_NODE (ip4_full_reass_handoff_node) = {
},
};
+VLIB_NODE_FN (ip4_local_full_reass_handoff_node)
+(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
+{
+ return ip4_full_reass_handoff_node_inline (vm, node, frame, NORMAL,
+ true /* is_local */);
+}
+
+VLIB_REGISTER_NODE (ip4_local_full_reass_handoff_node) = {
+ .name = "ip4-local-full-reassembly-handoff",
+ .vector_size = sizeof (u32),
+ .n_errors = ARRAY_LEN(ip4_full_reass_handoff_error_strings),
+ .error_strings = ip4_full_reass_handoff_error_strings,
+ .format_trace = format_ip4_full_reass_handoff_trace,
+
+ .n_next_nodes = 1,
+
+ .next_nodes = {
+ [0] = "error-drop",
+ },
+};
VLIB_NODE_FN (ip4_full_reass_feature_handoff_node) (vlib_main_t * vm,
vlib_node_runtime_t *
node,
vlib_frame_t * frame)
{
- return ip4_full_reass_handoff_node_inline (vm, node, frame, FEATURE);
+ return ip4_full_reass_handoff_node_inline (vm, node, frame, FEATURE,
+ false /* is_local */);
}
-
VLIB_REGISTER_NODE (ip4_full_reass_feature_handoff_node) = {
.name = "ip4-full-reass-feature-hoff",
.vector_size = sizeof (u32),
@@ -1863,10 +2016,10 @@ VLIB_NODE_FN (ip4_full_reass_custom_handoff_node) (vlib_main_t * vm,
node,
vlib_frame_t * frame)
{
- return ip4_full_reass_handoff_node_inline (vm, node, frame, CUSTOM);
+ return ip4_full_reass_handoff_node_inline (vm, node, frame, CUSTOM,
+ false /* is_local */);
}
-
VLIB_REGISTER_NODE (ip4_full_reass_custom_handoff_node) = {
.name = "ip4-full-reass-custom-hoff",
.vector_size = sizeof (u32),
@@ -1906,8 +2059,28 @@ ip4_full_reass_enable_disable_with_refcnt (u32 sw_if_index, int is_enable)
"ip4-full-reassembly-feature",
sw_if_index, 0, 0, 0);
}
- return -1;
+ return 0;
}
+
+void
+ip4_local_full_reass_enable_disable (int enable)
+{
+ if (enable)
+ {
+ ip4_full_reass_main.is_local_reass_enabled = 1;
+ }
+ else
+ {
+ ip4_full_reass_main.is_local_reass_enabled = 0;
+ }
+}
+
+int
+ip4_local_full_reass_enabled ()
+{
+ return ip4_full_reass_main.is_local_reass_enabled;
+}
+
#endif
/*
diff --git a/src/vnet/ip/reass/ip4_full_reass.h b/src/vnet/ip/reass/ip4_full_reass.h
index 000c80c5906..5df8107ca48 100644
--- a/src/vnet/ip/reass/ip4_full_reass.h
+++ b/src/vnet/ip/reass/ip4_full_reass.h
@@ -47,6 +47,9 @@ int ip4_full_reass_enable_disable_with_refcnt (u32 sw_if_index,
int is_enable);
uword ip4_full_reass_custom_register_next_node (uword node_index);
+
+void ip4_local_full_reass_enable_disable (int enable);
+int ip4_local_full_reass_enabled ();
#endif /* __included_ip4_full_reass_h__ */
/*
diff --git a/src/vnet/ip/reass/ip4_sv_reass.c b/src/vnet/ip/reass/ip4_sv_reass.c
index cd5e19b65d3..7c3c2fff217 100644
--- a/src/vnet/ip/reass/ip4_sv_reass.c
+++ b/src/vnet/ip/reass/ip4_sv_reass.c
@@ -48,7 +48,7 @@ typedef struct
{
struct
{
- u32 xx_id;
+ u32 fib_index;
ip4_address_t src;
ip4_address_t dst;
u16 frag_id;
@@ -150,6 +150,7 @@ typedef struct
/** Worker handoff */
u32 fq_index;
u32 fq_feature_index;
+ u32 fq_custom_context_index;
// reference count for enabling/disabling feature - per interface
u32 *feature_use_refcount_per_intf;
@@ -189,6 +190,7 @@ typedef struct
u8 ip_proto;
u16 l4_src_port;
u16 l4_dst_port;
+ int l4_layer_truncated;
} ip4_sv_reass_trace_t;
extern vlib_node_registration_t ip4_sv_reass_node;
@@ -225,6 +227,10 @@ format_ip4_sv_reass_trace (u8 * s, va_list * args)
s = format (s, "[not-fragmented]");
break;
}
+ if (t->l4_layer_truncated)
+ {
+ s = format (s, " [l4-layer-truncated]");
+ }
return s;
}
@@ -232,7 +238,8 @@ static void
ip4_sv_reass_add_trace (vlib_main_t *vm, vlib_node_runtime_t *node,
ip4_sv_reass_t *reass, u32 bi,
ip4_sv_reass_trace_operation_e action, u32 ip_proto,
- u16 l4_src_port, u16 l4_dst_port)
+ u16 l4_src_port, u16 l4_dst_port,
+ int l4_layer_truncated)
{
vlib_buffer_t *b = vlib_get_buffer (vm, bi);
if (pool_is_free_index
@@ -253,6 +260,7 @@ ip4_sv_reass_add_trace (vlib_main_t *vm, vlib_node_runtime_t *node,
t->ip_proto = ip_proto;
t->l4_src_port = l4_src_port;
t->l4_dst_port = l4_dst_port;
+ t->l4_layer_truncated = l4_layer_truncated;
#if 0
static u8 *s = NULL;
s = format (s, "%U", format_ip4_sv_reass_trace, NULL, NULL, t);
@@ -314,6 +322,8 @@ ip4_sv_reass_find_or_create (vlib_main_t * vm, ip4_sv_reass_main_t * rm,
ip4_sv_reass_t *reass = NULL;
f64 now = vlib_time_now (vm);
+again:
+
if (!clib_bihash_search_16_8 (&rm->hash, &kv->kv, &kv->kv))
{
if (vm->thread_index != kv->v.thread_index)
@@ -368,10 +378,14 @@ ip4_sv_reass_find_or_create (vlib_main_t * vm, ip4_sv_reass_main_t * rm,
kv->v.thread_index = vm->thread_index;
reass->last_heard = now;
- if (clib_bihash_add_del_16_8 (&rm->hash, &kv->kv, 1))
+ int rv = clib_bihash_add_del_16_8 (&rm->hash, &kv->kv, 2);
+ if (rv)
{
ip4_sv_reass_free (vm, rm, rt, reass);
reass = NULL;
+ // if another worker already created a context, work with that copy
+ if (-2 == rv)
+ goto again;
}
return reass;
@@ -407,9 +421,10 @@ ip4_sv_reass_update (vlib_main_t *vm, vlib_node_runtime_t *node,
vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
{
- ip4_sv_reass_add_trace (vm, node, reass, bi0, REASS_FINISH,
- reass->ip_proto, reass->l4_src_port,
- reass->l4_dst_port);
+ ip4_sv_reass_add_trace (
+ vm, node, reass, bi0, REASS_FINISH, reass->ip_proto,
+ reass->l4_src_port, reass->l4_dst_port,
+ vnet_buffer (b0)->ip.reass.l4_layer_truncated);
}
}
vec_add1 (reass->cached_buffers, bi0);
@@ -417,8 +432,9 @@ ip4_sv_reass_update (vlib_main_t *vm, vlib_node_runtime_t *node,
{
if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
{
- ip4_sv_reass_add_trace (vm, node, reass, bi0, REASS_FRAGMENT_CACHE,
- ~0, ~0, ~0);
+ ip4_sv_reass_add_trace (
+ vm, node, reass, bi0, REASS_FRAGMENT_CACHE, ~0, ~0, ~0,
+ vnet_buffer (b0)->ip.reass.l4_layer_truncated);
}
if (vec_len (reass->cached_buffers) > rm->max_reass_len)
{
@@ -428,15 +444,33 @@ ip4_sv_reass_update (vlib_main_t *vm, vlib_node_runtime_t *node,
return rc;
}
+always_inline int
+l4_layer_truncated (ip4_header_t *ip)
+{
+ static const int l4_layer_length[256] = {
+ [IP_PROTOCOL_TCP] = sizeof (tcp_header_t),
+ [IP_PROTOCOL_UDP] = sizeof (udp_header_t),
+ [IP_PROTOCOL_ICMP] = sizeof (icmp46_header_t),
+ };
+
+ return ((u8 *) ip + ip4_header_bytes (ip) + l4_layer_length[ip->protocol] >
+ (u8 *) ip + clib_net_to_host_u16 (ip->length));
+}
+
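(Editorial illustration of the check above, not part of the patch.)
/* Example: a first TCP fragment with a 20-byte IPv4 header and total length 24.
 * ip + 20 + sizeof (tcp_header_t) = ip + 40 > ip + 24, so the TCP header is
 * truncated; the callers below then set l4_layer_truncated and zero the L4
 * ports instead of reading flags and ports past the end of the fragment. */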
always_inline uword
-ip4_sv_reass_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
- vlib_frame_t * frame, bool is_feature,
- bool is_output_feature, bool is_custom)
+ip4_sv_reass_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
+ vlib_frame_t *frame, bool is_feature,
+ bool is_output_feature, bool is_custom,
+ bool with_custom_context)
{
u32 *from = vlib_frame_vector_args (frame);
- u32 n_left_from, n_left_to_next, *to_next, next_index;
+ u32 n_left_from, n_left_to_next, *to_next, *to_next_aux, next_index;
ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
ip4_sv_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
+ u32 *context;
+ if (with_custom_context)
+ context = vlib_frame_aux_args (frame);
+
clib_spinlock_lock (&rt->lock);
n_left_from = frame->n_vectors;
@@ -482,6 +516,7 @@ ip4_sv_reass_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
(is_output_feature ? 1 : 0) *
vnet_buffer (b1)->
ip.save_rewrite_length);
+
if (PREDICT_FALSE
(ip4_get_fragment_more (ip0) || ip4_get_fragment_offset (ip0))
|| (ip4_get_fragment_more (ip1) || ip4_get_fragment_offset (ip1)))
@@ -506,29 +541,40 @@ ip4_sv_reass_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
}
vnet_buffer (b0)->ip.reass.is_non_first_fragment = 0;
vnet_buffer (b0)->ip.reass.ip_proto = ip0->protocol;
- if (IP_PROTOCOL_TCP == ip0->protocol)
+ if (l4_layer_truncated (ip0))
{
- vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
- ((tcp_header_t *) (ip0 + 1))->flags;
- vnet_buffer (b0)->ip.reass.tcp_ack_number =
- ((tcp_header_t *) (ip0 + 1))->ack_number;
- vnet_buffer (b0)->ip.reass.tcp_seq_number =
- ((tcp_header_t *) (ip0 + 1))->seq_number;
+ vnet_buffer (b0)->ip.reass.l4_layer_truncated = 1;
+ vnet_buffer (b0)->ip.reass.l4_src_port = 0;
+ vnet_buffer (b0)->ip.reass.l4_dst_port = 0;
}
- else if (IP_PROTOCOL_ICMP == ip0->protocol)
+ else
{
- vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
- ((icmp46_header_t *) (ip0 + 1))->type;
+ vnet_buffer (b0)->ip.reass.l4_layer_truncated = 0;
+ if (IP_PROTOCOL_TCP == ip0->protocol)
+ {
+ vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
+ ((tcp_header_t *) (ip0 + 1))->flags;
+ vnet_buffer (b0)->ip.reass.tcp_ack_number =
+ ((tcp_header_t *) (ip0 + 1))->ack_number;
+ vnet_buffer (b0)->ip.reass.tcp_seq_number =
+ ((tcp_header_t *) (ip0 + 1))->seq_number;
+ }
+ else if (IP_PROTOCOL_ICMP == ip0->protocol)
+ {
+ vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
+ ((icmp46_header_t *) (ip0 + 1))->type;
+ }
+ vnet_buffer (b0)->ip.reass.l4_src_port = ip4_get_port (ip0, 1);
+ vnet_buffer (b0)->ip.reass.l4_dst_port = ip4_get_port (ip0, 0);
}
- vnet_buffer (b0)->ip.reass.l4_src_port = ip4_get_port (ip0, 1);
- vnet_buffer (b0)->ip.reass.l4_dst_port = ip4_get_port (ip0, 0);
if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
{
- ip4_sv_reass_add_trace (vm, node, NULL, from[(b - 2) - bufs],
- REASS_PASSTHROUGH,
- vnet_buffer (b0)->ip.reass.ip_proto,
- vnet_buffer (b0)->ip.reass.l4_src_port,
- vnet_buffer (b0)->ip.reass.l4_dst_port);
+ ip4_sv_reass_add_trace (
+ vm, node, NULL, from[(b - 2) - bufs], REASS_PASSTHROUGH,
+ vnet_buffer (b0)->ip.reass.ip_proto,
+ vnet_buffer (b0)->ip.reass.l4_src_port,
+ vnet_buffer (b0)->ip.reass.l4_dst_port,
+ vnet_buffer (b0)->ip.reass.l4_layer_truncated);
}
if (is_feature)
{
@@ -541,35 +587,48 @@ ip4_sv_reass_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
}
vnet_buffer (b1)->ip.reass.is_non_first_fragment = 0;
vnet_buffer (b1)->ip.reass.ip_proto = ip1->protocol;
- if (IP_PROTOCOL_TCP == ip1->protocol)
+ if (l4_layer_truncated (ip1))
{
- vnet_buffer (b1)->ip.reass.icmp_type_or_tcp_flags =
- ((tcp_header_t *) (ip1 + 1))->flags;
- vnet_buffer (b1)->ip.reass.tcp_ack_number =
- ((tcp_header_t *) (ip1 + 1))->ack_number;
- vnet_buffer (b1)->ip.reass.tcp_seq_number =
- ((tcp_header_t *) (ip1 + 1))->seq_number;
+ vnet_buffer (b1)->ip.reass.l4_layer_truncated = 1;
+ vnet_buffer (b1)->ip.reass.l4_src_port = 0;
+ vnet_buffer (b1)->ip.reass.l4_dst_port = 0;
}
- else if (IP_PROTOCOL_ICMP == ip1->protocol)
+ else
{
- vnet_buffer (b1)->ip.reass.icmp_type_or_tcp_flags =
- ((icmp46_header_t *) (ip1 + 1))->type;
+ vnet_buffer (b1)->ip.reass.l4_layer_truncated = 0;
+ if (IP_PROTOCOL_TCP == ip1->protocol)
+ {
+ vnet_buffer (b1)->ip.reass.icmp_type_or_tcp_flags =
+ ((tcp_header_t *) (ip1 + 1))->flags;
+ vnet_buffer (b1)->ip.reass.tcp_ack_number =
+ ((tcp_header_t *) (ip1 + 1))->ack_number;
+ vnet_buffer (b1)->ip.reass.tcp_seq_number =
+ ((tcp_header_t *) (ip1 + 1))->seq_number;
+ }
+ else if (IP_PROTOCOL_ICMP == ip1->protocol)
+ {
+ vnet_buffer (b1)->ip.reass.icmp_type_or_tcp_flags =
+ ((icmp46_header_t *) (ip1 + 1))->type;
+ }
+ vnet_buffer (b1)->ip.reass.l4_src_port = ip4_get_port (ip1, 1);
+ vnet_buffer (b1)->ip.reass.l4_dst_port = ip4_get_port (ip1, 0);
}
- vnet_buffer (b1)->ip.reass.l4_src_port = ip4_get_port (ip1, 1);
- vnet_buffer (b1)->ip.reass.l4_dst_port = ip4_get_port (ip1, 0);
if (PREDICT_FALSE (b1->flags & VLIB_BUFFER_IS_TRACED))
{
- ip4_sv_reass_add_trace (vm, node, NULL, from[(b - 1) - bufs],
- REASS_PASSTHROUGH,
- vnet_buffer (b1)->ip.reass.ip_proto,
- vnet_buffer (b1)->ip.reass.l4_src_port,
- vnet_buffer (b1)->ip.reass.l4_dst_port);
+ ip4_sv_reass_add_trace (
+ vm, node, NULL, from[(b - 1) - bufs], REASS_PASSTHROUGH,
+ vnet_buffer (b1)->ip.reass.ip_proto,
+ vnet_buffer (b1)->ip.reass.l4_src_port,
+ vnet_buffer (b1)->ip.reass.l4_dst_port,
+ vnet_buffer (b1)->ip.reass.l4_layer_truncated);
}
n_left_from -= 2;
next[0] = next0;
next[1] = next1;
next += 2;
+ if (with_custom_context)
+ context += 2;
}
while (n_left_from > 0)
@@ -608,34 +667,45 @@ ip4_sv_reass_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
}
vnet_buffer (b0)->ip.reass.is_non_first_fragment = 0;
vnet_buffer (b0)->ip.reass.ip_proto = ip0->protocol;
- if (IP_PROTOCOL_TCP == ip0->protocol)
+ if (l4_layer_truncated (ip0))
{
- vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
- ((tcp_header_t *) (ip0 + 1))->flags;
- vnet_buffer (b0)->ip.reass.tcp_ack_number =
- ((tcp_header_t *) (ip0 + 1))->ack_number;
- vnet_buffer (b0)->ip.reass.tcp_seq_number =
- ((tcp_header_t *) (ip0 + 1))->seq_number;
+ vnet_buffer (b0)->ip.reass.l4_layer_truncated = 1;
}
- else if (IP_PROTOCOL_ICMP == ip0->protocol)
+ else
{
- vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
- ((icmp46_header_t *) (ip0 + 1))->type;
+ vnet_buffer (b0)->ip.reass.l4_layer_truncated = 0;
+ if (IP_PROTOCOL_TCP == ip0->protocol)
+ {
+ vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
+ ((tcp_header_t *) (ip0 + 1))->flags;
+ vnet_buffer (b0)->ip.reass.tcp_ack_number =
+ ((tcp_header_t *) (ip0 + 1))->ack_number;
+ vnet_buffer (b0)->ip.reass.tcp_seq_number =
+ ((tcp_header_t *) (ip0 + 1))->seq_number;
+ }
+ else if (IP_PROTOCOL_ICMP == ip0->protocol)
+ {
+ vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
+ ((icmp46_header_t *) (ip0 + 1))->type;
+ }
+ vnet_buffer (b0)->ip.reass.l4_src_port = ip4_get_port (ip0, 1);
+ vnet_buffer (b0)->ip.reass.l4_dst_port = ip4_get_port (ip0, 0);
}
- vnet_buffer (b0)->ip.reass.l4_src_port = ip4_get_port (ip0, 1);
- vnet_buffer (b0)->ip.reass.l4_dst_port = ip4_get_port (ip0, 0);
if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
{
- ip4_sv_reass_add_trace (vm, node, NULL, from[(b - 1) - bufs],
- REASS_PASSTHROUGH,
- vnet_buffer (b0)->ip.reass.ip_proto,
- vnet_buffer (b0)->ip.reass.l4_src_port,
- vnet_buffer (b0)->ip.reass.l4_dst_port);
+ ip4_sv_reass_add_trace (
+ vm, node, NULL, from[(b - 1) - bufs], REASS_PASSTHROUGH,
+ vnet_buffer (b0)->ip.reass.ip_proto,
+ vnet_buffer (b0)->ip.reass.l4_src_port,
+ vnet_buffer (b0)->ip.reass.l4_dst_port,
+ vnet_buffer (b0)->ip.reass.l4_layer_truncated);
}
n_left_from -= 1;
next[0] = next0;
next += 1;
+ if (with_custom_context)
+ context += 1;
}
vlib_buffer_enqueue_to_next (vm, node, from, (u16 *) nexts,
@@ -649,7 +719,11 @@ slow_path:
while (n_left_from > 0)
{
- vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+ if (with_custom_context)
+ vlib_get_next_frame_with_aux_safe (vm, node, next_index, to_next,
+ to_next_aux, n_left_to_next);
+ else
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
while (n_left_from > 0 && n_left_to_next > 0)
{
@@ -657,6 +731,7 @@ slow_path:
vlib_buffer_t *b0;
u32 next0;
u32 error0 = IP4_ERROR_NONE;
+ u8 forward_context = 0;
bi0 = from[0];
b0 = vlib_get_buffer (vm, bi0);
@@ -679,29 +754,42 @@ slow_path:
}
vnet_buffer (b0)->ip.reass.is_non_first_fragment = 0;
vnet_buffer (b0)->ip.reass.ip_proto = ip0->protocol;
- if (IP_PROTOCOL_TCP == ip0->protocol)
+ if (l4_layer_truncated (ip0))
{
- vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
- ((tcp_header_t *) (ip0 + 1))->flags;
- vnet_buffer (b0)->ip.reass.tcp_ack_number =
- ((tcp_header_t *) (ip0 + 1))->ack_number;
- vnet_buffer (b0)->ip.reass.tcp_seq_number =
- ((tcp_header_t *) (ip0 + 1))->seq_number;
+ vnet_buffer (b0)->ip.reass.l4_layer_truncated = 1;
+ vnet_buffer (b0)->ip.reass.l4_src_port = 0;
+ vnet_buffer (b0)->ip.reass.l4_dst_port = 0;
}
- else if (IP_PROTOCOL_ICMP == ip0->protocol)
+ else
{
- vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
- ((icmp46_header_t *) (ip0 + 1))->type;
+ vnet_buffer (b0)->ip.reass.l4_layer_truncated = 0;
+ if (IP_PROTOCOL_TCP == ip0->protocol)
+ {
+ vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
+ ((tcp_header_t *) (ip0 + 1))->flags;
+ vnet_buffer (b0)->ip.reass.tcp_ack_number =
+ ((tcp_header_t *) (ip0 + 1))->ack_number;
+ vnet_buffer (b0)->ip.reass.tcp_seq_number =
+ ((tcp_header_t *) (ip0 + 1))->seq_number;
+ }
+ else if (IP_PROTOCOL_ICMP == ip0->protocol)
+ {
+ vnet_buffer (b0)->ip.reass.icmp_type_or_tcp_flags =
+ ((icmp46_header_t *) (ip0 + 1))->type;
+ }
+ vnet_buffer (b0)->ip.reass.l4_src_port =
+ ip4_get_port (ip0, 1);
+ vnet_buffer (b0)->ip.reass.l4_dst_port =
+ ip4_get_port (ip0, 0);
}
- vnet_buffer (b0)->ip.reass.l4_src_port = ip4_get_port (ip0, 1);
- vnet_buffer (b0)->ip.reass.l4_dst_port = ip4_get_port (ip0, 0);
if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
{
ip4_sv_reass_add_trace (
vm, node, NULL, bi0, REASS_PASSTHROUGH,
vnet_buffer (b0)->ip.reass.ip_proto,
vnet_buffer (b0)->ip.reass.l4_src_port,
- vnet_buffer (b0)->ip.reass.l4_dst_port);
+ vnet_buffer (b0)->ip.reass.l4_dst_port,
+ vnet_buffer (b0)->ip.reass.l4_layer_truncated);
}
goto packet_enqueue;
}
@@ -719,13 +807,17 @@ slow_path:
ip4_sv_reass_kv_t kv;
u8 do_handoff = 0;
- kv.k.as_u64[0] =
- (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
- vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
- (u64) ip0->src_address.as_u32 << 32;
- kv.k.as_u64[1] =
- (u64) ip0->dst_address.
- as_u32 | (u64) ip0->fragment_id << 32 | (u64) ip0->protocol << 48;
+ if (with_custom_context)
+ kv.k.as_u64[0] = (u64) *context | (u64) ip0->src_address.as_u32
+ << 32;
+ else
+ kv.k.as_u64[0] =
+ (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
+ vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
+ (u64) ip0->src_address.as_u32 << 32;
+ kv.k.as_u64[1] = (u64) ip0->dst_address.as_u32 |
+ (u64) ip0->fragment_id << 32 |
+ (u64) ip0->protocol << 48;
ip4_sv_reass_t *reass =
ip4_sv_reass_find_or_create (vm, rm, rt, &kv, &do_handoff);
@@ -735,6 +827,8 @@ slow_path:
next0 = IP4_SV_REASSEMBLY_NEXT_HANDOFF;
vnet_buffer (b0)->ip.reass.owner_thread_index =
kv.v.thread_index;
+ if (with_custom_context)
+ forward_context = 1;
goto packet_enqueue;
}
@@ -771,31 +865,32 @@ slow_path:
{
ip4_sv_reass_add_trace (
vm, node, reass, bi0, REASS_FRAGMENT_FORWARD,
- reass->ip_proto, reass->l4_src_port, reass->l4_dst_port);
+ reass->ip_proto, reass->l4_src_port, reass->l4_dst_port,
+ vnet_buffer (b0)->ip.reass.l4_layer_truncated);
}
goto packet_enqueue;
}
ip4_sv_reass_rc_t rc =
ip4_sv_reass_update (vm, node, rm, ip0, reass, bi0);
+ u32 counter = ~0;
switch (rc)
{
case IP4_SV_REASS_RC_OK:
/* nothing to do here */
break;
case IP4_SV_REASS_RC_TOO_MANY_FRAGMENTS:
- vlib_node_increment_counter (vm, node->node_index,
- IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG,
- 1);
- ip4_sv_reass_free (vm, rm, rt, reass);
- goto next_packet;
+ counter = IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG;
break;
case IP4_SV_REASS_RC_UNSUPP_IP_PROTO:
- vlib_node_increment_counter (vm, node->node_index,
- IP4_ERROR_REASS_UNSUPP_IP_PROT, 1);
+ counter = IP4_ERROR_REASS_UNSUPP_IP_PROT;
+ break;
+ }
+ if (~0 != counter)
+ {
+ vlib_node_increment_counter (vm, node->node_index, counter, 1);
ip4_sv_reass_free (vm, rm, rt, reass);
goto next_packet;
- break;
}
if (reass->is_complete)
{
@@ -843,13 +938,15 @@ slow_path:
{
ip4_sv_reass_add_trace (
vm, node, reass, bi0, REASS_FRAGMENT_FORWARD,
- reass->ip_proto, reass->l4_src_port, reass->l4_dst_port);
+ reass->ip_proto, reass->l4_src_port, reass->l4_dst_port,
+ vnet_buffer (b0)->ip.reass.l4_layer_truncated);
}
vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
to_next, n_left_to_next, bi0,
next0);
}
- _vec_len (reass->cached_buffers) = 0; // buffers are owned by frame now
+ vec_set_len (reass->cached_buffers,
+ 0); // buffers are owned by frame now
}
goto next_packet;
@@ -862,13 +959,26 @@ slow_path:
b0 = vlib_get_buffer (vm, bi0);
vnet_feature_next (&next0, b0);
}
- vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
- to_next, n_left_to_next,
- bi0, next0);
+ if (with_custom_context && forward_context)
+ {
+ if (to_next_aux)
+ {
+ to_next_aux[0] = *context;
+ to_next_aux += 1;
+ }
+ vlib_validate_buffer_enqueue_with_aux_x1 (
+ vm, node, next_index, to_next, to_next_aux, n_left_to_next,
+ bi0, *context, next0);
+ }
+ else
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+ n_left_to_next, bi0, next0);
next_packet:
from += 1;
n_left_from -= 1;
+ if (with_custom_context)
+ context += 1;
}
vlib_put_next_frame (vm, node, next_index, n_left_to_next);
@@ -879,28 +989,21 @@ done:
return frame->n_vectors;
}
-static char *ip4_sv_reass_error_strings[] = {
-#define _(sym, string) string,
- foreach_ip4_error
-#undef _
-};
-
VLIB_NODE_FN (ip4_sv_reass_node) (vlib_main_t * vm,
vlib_node_runtime_t * node,
vlib_frame_t * frame)
{
- return ip4_sv_reass_inline (vm, node, frame, false /* is_feature */ ,
- false /* is_output_feature */ ,
- false /* is_custom */ );
+ return ip4_sv_reass_inline (
+ vm, node, frame, false /* is_feature */, false /* is_output_feature */,
+ false /* is_custom */, false /* with_custom_context */);
}
-/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_sv_reass_node) = {
.name = "ip4-sv-reassembly",
.vector_size = sizeof (u32),
.format_trace = format_ip4_sv_reass_trace,
- .n_errors = ARRAY_LEN (ip4_sv_reass_error_strings),
- .error_strings = ip4_sv_reass_error_strings,
+ .n_errors = IP4_N_ERROR,
+ .error_counters = ip4_error_counters,
.n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
.next_nodes =
{
@@ -910,24 +1013,22 @@ VLIB_REGISTER_NODE (ip4_sv_reass_node) = {
},
};
-/* *INDENT-ON* */
VLIB_NODE_FN (ip4_sv_reass_node_feature) (vlib_main_t * vm,
vlib_node_runtime_t * node,
vlib_frame_t * frame)
{
- return ip4_sv_reass_inline (vm, node, frame, true /* is_feature */ ,
- false /* is_output_feature */ ,
- false /* is_custom */ );
+ return ip4_sv_reass_inline (
+ vm, node, frame, true /* is_feature */, false /* is_output_feature */,
+ false /* is_custom */, false /* with_custom_context */);
}
-/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_sv_reass_node_feature) = {
.name = "ip4-sv-reassembly-feature",
.vector_size = sizeof (u32),
.format_trace = format_ip4_sv_reass_trace,
- .n_errors = ARRAY_LEN (ip4_sv_reass_error_strings),
- .error_strings = ip4_sv_reass_error_strings,
+ .n_errors = IP4_N_ERROR,
+ .error_counters = ip4_error_counters,
.n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
.next_nodes =
{
@@ -936,34 +1037,30 @@ VLIB_REGISTER_NODE (ip4_sv_reass_node_feature) = {
[IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reass-feature-hoff",
},
};
-/* *INDENT-ON* */
-/* *INDENT-OFF* */
VNET_FEATURE_INIT (ip4_sv_reass_feature) = {
.arc_name = "ip4-unicast",
.node_name = "ip4-sv-reassembly-feature",
.runs_before = VNET_FEATURES ("ip4-lookup"),
.runs_after = 0,
};
-/* *INDENT-ON* */
VLIB_NODE_FN (ip4_sv_reass_node_output_feature) (vlib_main_t * vm,
vlib_node_runtime_t * node,
vlib_frame_t * frame)
{
- return ip4_sv_reass_inline (vm, node, frame, true /* is_feature */ ,
- true /* is_output_feature */ ,
- false /* is_custom */ );
+ return ip4_sv_reass_inline (
+ vm, node, frame, true /* is_feature */, true /* is_output_feature */,
+ false /* is_custom */, false /* with_custom_context */);
}
-/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_sv_reass_node_output_feature) = {
.name = "ip4-sv-reassembly-output-feature",
.vector_size = sizeof (u32),
.format_trace = format_ip4_sv_reass_trace,
- .n_errors = ARRAY_LEN (ip4_sv_reass_error_strings),
- .error_strings = ip4_sv_reass_error_strings,
+ .n_errors = IP4_N_ERROR,
+ .error_counters = ip4_error_counters,
.n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
.next_nodes =
{
@@ -972,24 +1069,20 @@ VLIB_REGISTER_NODE (ip4_sv_reass_node_output_feature) = {
[IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reass-feature-hoff",
},
};
-/* *INDENT-ON* */
-/* *INDENT-OFF* */
VNET_FEATURE_INIT (ip4_sv_reass_output_feature) = {
.arc_name = "ip4-output",
.node_name = "ip4-sv-reassembly-output-feature",
.runs_before = 0,
.runs_after = 0,
};
-/* *INDENT-ON* */
-/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_sv_reass_custom_node) = {
.name = "ip4-sv-reassembly-custom-next",
.vector_size = sizeof (u32),
.format_trace = format_ip4_sv_reass_trace,
- .n_errors = ARRAY_LEN (ip4_sv_reass_error_strings),
- .error_strings = ip4_sv_reass_error_strings,
+ .n_errors = IP4_N_ERROR,
+ .error_counters = ip4_error_counters,
.n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
.next_nodes =
{
@@ -999,15 +1092,39 @@ VLIB_REGISTER_NODE (ip4_sv_reass_custom_node) = {
},
};
-/* *INDENT-ON* */
VLIB_NODE_FN (ip4_sv_reass_custom_node) (vlib_main_t * vm,
vlib_node_runtime_t * node,
vlib_frame_t * frame)
{
- return ip4_sv_reass_inline (vm, node, frame, false /* is_feature */ ,
- false /* is_output_feature */ ,
- true /* is_custom */ );
+ return ip4_sv_reass_inline (
+ vm, node, frame, false /* is_feature */, false /* is_output_feature */,
+ true /* is_custom */, false /* with_custom_context */);
+}
+
+VLIB_REGISTER_NODE (ip4_sv_reass_custom_context_node) = {
+ .name = "ip4-sv-reassembly-custom-context",
+ .vector_size = sizeof (u32),
+ .aux_size = sizeof(u32),
+ .format_trace = format_ip4_sv_reass_trace,
+ .n_errors = IP4_N_ERROR,
+ .error_counters = ip4_error_counters,
+ .n_next_nodes = IP4_SV_REASSEMBLY_N_NEXT,
+ .next_nodes =
+ {
+ [IP4_SV_REASSEMBLY_NEXT_INPUT] = "ip4-input",
+ [IP4_SV_REASSEMBLY_NEXT_DROP] = "ip4-drop",
+ [IP4_SV_REASSEMBLY_NEXT_HANDOFF] = "ip4-sv-reassembly-custom-context-handoff",
+
+ },
+};
+
+VLIB_NODE_FN (ip4_sv_reass_custom_context_node)
+(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
+{
+ return ip4_sv_reass_inline (
+ vm, node, frame, false /* is_feature */, false /* is_output_feature */,
+ true /* is_custom */, true /* with_custom_context */);
}
#ifndef CLIB_MARCH_VARIANT
@@ -1152,6 +1269,8 @@ ip4_sv_reass_init_function (vlib_main_t * vm)
rm->fq_index = vlib_frame_queue_main_init (ip4_sv_reass_node.index, 0);
rm->fq_feature_index =
vlib_frame_queue_main_init (ip4_sv_reass_node_feature.index, 0);
+ rm->fq_custom_context_index =
+ vlib_frame_queue_main_init (ip4_sv_reass_custom_context_node.index, 0);
rm->feature_use_refcount_per_intf = NULL;
rm->output_feature_use_refcount_per_intf = NULL;
@@ -1204,7 +1323,6 @@ ip4_sv_reass_walk_expired (vlib_main_t *vm,
clib_spinlock_lock (&rt->lock);
vec_reset_length (pool_indexes_to_free);
- /* *INDENT-OFF* */
pool_foreach_index (index, rt->pool) {
reass = pool_elt_at_index (rt->pool, index);
if (now > reass->last_heard + rm->timeout)
@@ -1212,15 +1330,12 @@ ip4_sv_reass_walk_expired (vlib_main_t *vm,
vec_add1 (pool_indexes_to_free, index);
}
}
- /* *INDENT-ON* */
int *i;
- /* *INDENT-OFF* */
vec_foreach (i, pool_indexes_to_free)
{
ip4_sv_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
ip4_sv_reass_free (vm, rm, rt, reass);
}
- /* *INDENT-ON* */
clib_spinlock_unlock (&rt->lock);
}
@@ -1228,33 +1343,29 @@ ip4_sv_reass_walk_expired (vlib_main_t *vm,
vec_free (pool_indexes_to_free);
if (event_data)
{
- _vec_len (event_data) = 0;
+ vec_set_len (event_data, 0);
}
}
return 0;
}
-/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_sv_reass_expire_node) = {
- .function = ip4_sv_reass_walk_expired,
- .type = VLIB_NODE_TYPE_PROCESS,
- .name = "ip4-sv-reassembly-expire-walk",
- .format_trace = format_ip4_sv_reass_trace,
- .n_errors = ARRAY_LEN (ip4_sv_reass_error_strings),
- .error_strings = ip4_sv_reass_error_strings,
-
+ .function = ip4_sv_reass_walk_expired,
+ .type = VLIB_NODE_TYPE_PROCESS,
+ .name = "ip4-sv-reassembly-expire-walk",
+ .format_trace = format_ip4_sv_reass_trace,
+ .n_errors = IP4_N_ERROR,
+ .error_counters = ip4_error_counters,
};
-/* *INDENT-ON* */
static u8 *
format_ip4_sv_reass_key (u8 * s, va_list * args)
{
ip4_sv_reass_key_t *key = va_arg (*args, ip4_sv_reass_key_t *);
s =
- format (s,
- "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
- key->xx_id, format_ip4_address, &key->src, format_ip4_address,
+ format (s, "fib_index: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
+ key->fib_index, format_ip4_address, &key->src, format_ip4_address,
&key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
return s;
}
@@ -1313,11 +1424,9 @@ show_ip4_reass (vlib_main_t * vm,
clib_spinlock_lock (&rt->lock);
if (details)
{
- /* *INDENT-OFF* */
pool_foreach (reass, rt->pool) {
vlib_cli_output (vm, "%U", format_ip4_sv_reass, vm, reass);
}
- /* *INDENT-ON* */
}
sum_reass_n += rt->reass_n;
clib_spinlock_unlock (&rt->lock);
@@ -1341,13 +1450,11 @@ show_ip4_reass (vlib_main_t * vm,
return 0;
}
-/* *INDENT-OFF* */
VLIB_CLI_COMMAND (show_ip4_sv_reass_cmd, static) = {
.path = "show ip4-sv-reassembly",
.short_help = "show ip4-sv-reassembly [details]",
.function = show_ip4_reass,
};
-/* *INDENT-ON* */
#ifndef CLIB_MARCH_VARIANT
vnet_api_error_t
@@ -1398,25 +1505,30 @@ format_ip4_sv_reass_handoff_trace (u8 * s, va_list * args)
}
always_inline uword
-ip4_sv_reass_handoff_node_inline (vlib_main_t * vm,
- vlib_node_runtime_t * node,
- vlib_frame_t * frame, bool is_feature)
+ip4_sv_reass_handoff_node_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
+ vlib_frame_t *frame, bool is_feature,
+ bool is_custom_context)
{
ip4_sv_reass_main_t *rm = &ip4_sv_reass_main;
vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
- u32 n_enq, n_left_from, *from;
+ u32 n_enq, n_left_from, *from, *context;
u16 thread_indices[VLIB_FRAME_SIZE], *ti;
u32 fq_index;
from = vlib_frame_vector_args (frame);
+ if (is_custom_context)
+ context = vlib_frame_aux_args (frame);
+
n_left_from = frame->n_vectors;
vlib_get_buffers (vm, from, bufs, n_left_from);
b = bufs;
ti = thread_indices;
- fq_index = (is_feature) ? rm->fq_feature_index : rm->fq_index;
+ fq_index = (is_feature) ? rm->fq_feature_index :
+ (is_custom_context ? rm->fq_custom_context_index :
+ rm->fq_index);
while (n_left_from > 0)
{
@@ -1435,8 +1547,12 @@ ip4_sv_reass_handoff_node_inline (vlib_main_t * vm,
ti += 1;
b += 1;
}
- n_enq = vlib_buffer_enqueue_to_thread (vm, node, fq_index, from,
- thread_indices, frame->n_vectors, 1);
+ if (is_custom_context)
+ n_enq = vlib_buffer_enqueue_to_thread_with_aux (
+ vm, node, fq_index, from, context, thread_indices, frame->n_vectors, 1);
+ else
+ n_enq = vlib_buffer_enqueue_to_thread (
+ vm, node, fq_index, from, thread_indices, frame->n_vectors, 1);
if (n_enq < frame->n_vectors)
vlib_node_increment_counter (vm, node->node_index,
@@ -1449,12 +1565,11 @@ VLIB_NODE_FN (ip4_sv_reass_handoff_node) (vlib_main_t * vm,
vlib_node_runtime_t * node,
vlib_frame_t * frame)
{
- return ip4_sv_reass_handoff_node_inline (vm, node, frame,
- false /* is_feature */ );
+ return ip4_sv_reass_handoff_node_inline (
+ vm, node, frame, false /* is_feature */, false /* is_custom_context */);
}
-/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_sv_reass_handoff_node) = {
.name = "ip4-sv-reassembly-handoff",
.vector_size = sizeof (u32),
@@ -1468,22 +1583,39 @@ VLIB_REGISTER_NODE (ip4_sv_reass_handoff_node) = {
[0] = "error-drop",
},
};
-/* *INDENT-ON* */
+VLIB_NODE_FN (ip4_sv_reass_custom_context_handoff_node)
+(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
+{
+ return ip4_sv_reass_handoff_node_inline (
+ vm, node, frame, false /* is_feature */, true /* is_custom_context */);
+}
+
+VLIB_REGISTER_NODE (ip4_sv_reass_custom_context_handoff_node) = {
+ .name = "ip4-sv-reassembly-custom-context-handoff",
+ .vector_size = sizeof (u32),
+ .aux_size = sizeof (u32),
+ .n_errors = ARRAY_LEN(ip4_sv_reass_handoff_error_strings),
+ .error_strings = ip4_sv_reass_handoff_error_strings,
+ .format_trace = format_ip4_sv_reass_handoff_trace,
+
+ .n_next_nodes = 1,
+
+ .next_nodes = {
+ [0] = "error-drop",
+ },
+};
-/* *INDENT-OFF* */
VLIB_NODE_FN (ip4_sv_reass_feature_handoff_node) (vlib_main_t * vm,
vlib_node_runtime_t *
node,
vlib_frame_t * frame)
{
- return ip4_sv_reass_handoff_node_inline (vm, node, frame,
- true /* is_feature */ );
+ return ip4_sv_reass_handoff_node_inline (
+ vm, node, frame, true /* is_feature */, false /* is_custom_context */);
}
-/* *INDENT-ON* */
-/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip4_sv_reass_feature_handoff_node) = {
.name = "ip4-sv-reass-feature-hoff",
.vector_size = sizeof (u32),
@@ -1497,7 +1629,6 @@ VLIB_REGISTER_NODE (ip4_sv_reass_feature_handoff_node) = {
[0] = "error-drop",
},
};
-/* *INDENT-ON* */
#ifndef CLIB_MARCH_VARIANT
int
@@ -1535,6 +1666,13 @@ ip4_sv_reass_custom_register_next_node (uword node_index)
node_index);
}
+uword
+ip4_sv_reass_custom_context_register_next_node (uword node_index)
+{
+ return vlib_node_add_next (
+ vlib_get_main (), ip4_sv_reass_custom_context_node.index, node_index);
+}
+
int
ip4_sv_reass_output_enable_disable_with_refcnt (u32 sw_if_index,
int is_enable)
diff --git a/src/vnet/ip/reass/ip4_sv_reass.h b/src/vnet/ip/reass/ip4_sv_reass.h
index e926dbeebcc..3a684eb9809 100644
--- a/src/vnet/ip/reass/ip4_sv_reass.h
+++ b/src/vnet/ip/reass/ip4_sv_reass.h
@@ -49,6 +49,7 @@ int ip4_sv_reass_output_enable_disable_with_refcnt (u32 sw_if_index,
int is_enable);
uword ip4_sv_reass_custom_register_next_node (uword node_index);
+uword ip4_sv_reass_custom_context_register_next_node (uword node_index);
#endif /* __included_ip4_sv_reass_h__ */
diff --git a/src/vnet/ip/reass/ip6_full_reass.c b/src/vnet/ip/reass/ip6_full_reass.c
index 9ec40cd347c..27647985877 100644
--- a/src/vnet/ip/reass/ip6_full_reass.c
+++ b/src/vnet/ip/reass/ip6_full_reass.c
@@ -25,10 +25,14 @@
#include <vnet/ip/ip.h>
#include <vppinfra/bihash_48_8.h>
#include <vnet/ip/reass/ip6_full_reass.h>
+#include <vnet/ip/ip6_inlines.h>
#define MSEC_PER_SEC 1000
-#define IP6_FULL_REASS_TIMEOUT_DEFAULT_MS 100
-#define IP6_FULL_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000 // 10 seconds default
+#define IP6_FULL_REASS_TIMEOUT_DEFAULT_MS 200
+/* As there are only 1024 reassembly contexts per thread, DDoS attacks or
+ * bursts of genuine timeouts would consume these contexts quickly, running
+ * out of context space and leaving us unable to perform reassembly */
+#define IP6_FULL_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 50 // 50 ms default
#define IP6_FULL_REASS_MAX_REASSEMBLIES_DEFAULT 1024
#define IP6_FULL_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
#define IP6_FULL_REASS_HT_LOAD_FACTOR (0.75)
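A quick back-of-the-envelope check of the defaults above (200 ms timeout, 50 ms walk interval, 1024 contexts per thread) shows why the walk interval was shortened; the standalone sketch below is illustrative only and just reproduces the arithmetic, the constants are the defaults from this file:

/* Sketch: worst-case lifetime of an expired context and the flow rate that
 * exhausts the per-thread pool, using the default values above. */
#include <stdio.h>

int
main (void)
{
  const double timeout_ms = 200.0;	/* IP6_FULL_REASS_TIMEOUT_DEFAULT_MS */
  const double walk_interval_ms = 50.0; /* expire walk period */
  const double max_contexts = 1024.0;	/* per-thread reassembly contexts */

  /* worst case, an expired context lingers for timeout + one walk period */
  double worst_case_hold_ms = timeout_ms + walk_interval_ms;

  /* new fragment flows per second that fully exhaust the pool */
  double exhaust_rate = max_contexts * 1000.0 / worst_case_hold_ms;

  printf ("worst-case context hold time: %.0f ms\n", worst_case_hold_ms);
  printf ("flows/sec to exhaust %g contexts: %.0f\n", max_contexts,
	  exhaust_rate);
  return 0;
}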
@@ -40,6 +44,8 @@ typedef enum
IP6_FULL_REASS_RC_TOO_MANY_FRAGMENTS,
IP6_FULL_REASS_RC_NO_BUF,
IP6_FULL_REASS_RC_HANDOFF,
+ IP6_FULL_REASS_RC_INVALID_FRAG_LEN,
+ IP6_FULL_REASS_RC_OVERLAP,
} ip6_full_reass_rc_t;
typedef struct
@@ -132,6 +138,8 @@ typedef struct
ip6_full_reass_t *pool;
u32 reass_n;
u32 id_counter;
+ // for pacing the main thread timeouts
+ u32 last_id;
clib_spinlock_t lock;
} ip6_full_reass_per_thread_t;
@@ -155,17 +163,20 @@ typedef struct
// convenience
vlib_main_t *vlib_main;
- // node index of ip6-drop node
- u32 ip6_drop_idx;
u32 ip6_icmp_error_idx;
u32 ip6_full_reass_expire_node_idx;
/** Worker handoff */
u32 fq_index;
+ u32 fq_local_index;
u32 fq_feature_index;
+ u32 fq_custom_index;
// reference count for enabling/disabling feature - per interface
u32 *feature_use_refcount_per_intf;
+
+ // whether local fragmented packets are reassembled or not
+ int is_local_reass_enabled;
} ip6_full_reass_main_t;
extern ip6_full_reass_main_t ip6_full_reass_main;
@@ -185,13 +196,22 @@ typedef enum
typedef enum
{
+ NORMAL,
+ FEATURE,
+ CUSTOM
+} ip6_full_reass_node_type_t;
+
+typedef enum
+{
RANGE_NEW,
+ RANGE_DISCARD,
RANGE_OVERLAP,
ICMP_ERROR_RT_EXCEEDED,
ICMP_ERROR_FL_TOO_BIG,
ICMP_ERROR_FL_NOT_MULT_8,
FINALIZE,
HANDOFF,
+ PASSTHROUGH,
} ip6_full_reass_trace_operation_e;
typedef struct
@@ -278,6 +298,10 @@ format_ip6_full_reass_trace (u8 * s, va_list * args)
s = format (s, "\n%Unew %U", format_white_space, indent,
format_ip6_full_reass_range_trace, &t->trace_range);
break;
+ case RANGE_DISCARD:
+ s = format (s, "\n%Udiscard %U", format_white_space, indent,
+ format_ip6_full_reass_range_trace, &t->trace_range);
+ break;
case RANGE_OVERLAP:
s = format (s, "\n%Uoverlap %U", format_white_space, indent,
format_ip6_full_reass_range_trace, &t->trace_range);
@@ -304,6 +328,9 @@ format_ip6_full_reass_trace (u8 * s, va_list * args)
format (s, "handoff from thread #%u to thread #%u", t->thread_id,
t->thread_id_to);
break;
+ case PASSTHROUGH:
+ s = format (s, "passthrough - not a fragment");
+ break;
}
return s;
}
@@ -396,59 +423,69 @@ ip6_full_reass_free (ip6_full_reass_main_t * rm,
ip6_full_reass_free_ctx (rt, reass);
}
+/* n_left_to_next and to_next are taken as input params, as this function
+ * can be called from a graph node which manages its own local copies of
+ * these variables; ignoring those and enqueueing the buffers with fresh
+ * local variables would cause either a buffer leak or corruption */
always_inline void
ip6_full_reass_drop_all (vlib_main_t *vm, vlib_node_runtime_t *node,
- ip6_full_reass_t *reass)
+ ip6_full_reass_t *reass, u32 *n_left_to_next,
+ u32 **to_next)
{
u32 range_bi = reass->first_bi;
vlib_buffer_t *range_b;
vnet_buffer_opaque_t *range_vnb;
u32 *to_free = NULL;
+
while (~0 != range_bi)
{
range_b = vlib_get_buffer (vm, range_bi);
range_vnb = vnet_buffer (range_b);
- u32 bi = range_bi;
- while (~0 != bi)
+
+ if (~0 != range_bi)
{
- vec_add1 (to_free, bi);
- vlib_buffer_t *b = vlib_get_buffer (vm, bi);
- if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
- {
- bi = b->next_buffer;
- b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
- }
- else
- {
- bi = ~0;
- }
+ vec_add1 (to_free, range_bi);
}
range_bi = range_vnb->ip.reass.next_range_bi;
}
+
/* send to next_error_index */
- if (~0 != reass->error_next_index)
+ if (~0 != reass->error_next_index &&
+ reass->error_next_index < node->n_next_nodes)
{
- u32 n_left_to_next, *to_next, next_index;
+ u32 next_index;
next_index = reass->error_next_index;
u32 bi = ~0;
+ /* record number of packets sent to custom app */
+ vlib_node_increment_counter (vm, node->node_index,
+ IP6_ERROR_REASS_TO_CUSTOM_APP,
+ vec_len (to_free));
+
while (vec_len (to_free) > 0)
{
- vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+ vlib_get_next_frame (vm, node, next_index, *to_next,
+ (*n_left_to_next));
- while (vec_len (to_free) > 0 && n_left_to_next > 0)
+ while (vec_len (to_free) > 0 && (*n_left_to_next) > 0)
{
bi = vec_pop (to_free);
if (~0 != bi)
{
- to_next[0] = bi;
- to_next += 1;
- n_left_to_next -= 1;
+ vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+ if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ ip6_full_reass_add_trace (vm, node, reass, bi, NULL,
+ RANGE_DISCARD, ~0);
+ }
+ *to_next[0] = bi;
+ (*to_next) += 1;
+ (*n_left_to_next) -= 1;
}
}
- vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ vlib_put_next_frame (vm, node, next_index, (*n_left_to_next));
}
}
else
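The comment above ip6_full_reass_drop_all explains why the frame cursors are passed by reference rather than re-fetched locally. A minimal standalone sketch of that pattern (not VPP code; names here are hypothetical) is shown below: the callee advances the caller-owned cursor and count, so the caller's bookkeeping stays consistent with what was actually enqueued.

/* Sketch: appending into a caller-owned frame through pointers. */
#include <stdio.h>

#define FRAME_SIZE 8

static void
drop_all (unsigned *ids, int n_ids, unsigned **to_next, int *n_left)
{
  /* enqueue as many ids as fit into the caller-owned frame */
  for (int i = 0; i < n_ids && *n_left > 0; i++)
    {
      (*to_next)[0] = ids[i];
      *to_next += 1;	/* advance the caller's cursor ... */
      *n_left -= 1;	/* ... and its remaining-slot count */
    }
}

int
main (void)
{
  unsigned frame[FRAME_SIZE];
  unsigned *to_next = frame;
  int n_left = FRAME_SIZE;
  unsigned ids[] = { 1, 2, 3 };

  drop_all (ids, 3, &to_next, &n_left);

  /* the caller still sees a consistent view of the frame */
  printf ("used %d slots, %d left\n", (int) (to_next - frame), n_left);
  return 0;
}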
@@ -459,8 +496,65 @@ ip6_full_reass_drop_all (vlib_main_t *vm, vlib_node_runtime_t *node,
}
always_inline void
-ip6_full_reass_on_timeout (vlib_main_t * vm, vlib_node_runtime_t * node,
- ip6_full_reass_t * reass, u32 * icmp_bi)
+sanitize_reass_buffers_add_missing (vlib_main_t *vm, ip6_full_reass_t *reass,
+ u32 *bi0)
+{
+ u32 range_bi = reass->first_bi;
+ vlib_buffer_t *range_b;
+ vnet_buffer_opaque_t *range_vnb;
+
+ while (~0 != range_bi)
+ {
+ range_b = vlib_get_buffer (vm, range_bi);
+ range_vnb = vnet_buffer (range_b);
+ u32 bi = range_bi;
+ if (~0 != bi)
+ {
+ if (bi == *bi0)
+ *bi0 = ~0;
+ if (range_b->flags & VLIB_BUFFER_NEXT_PRESENT)
+ {
+ u32 _bi = bi;
+ vlib_buffer_t *_b = vlib_get_buffer (vm, _bi);
+ while (_b->flags & VLIB_BUFFER_NEXT_PRESENT)
+ {
+ if (_b->next_buffer != range_vnb->ip.reass.next_range_bi)
+ {
+ _bi = _b->next_buffer;
+ _b = vlib_get_buffer (vm, _bi);
+ }
+ else
+ {
+ _b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
+ break;
+ }
+ }
+ }
+ range_bi = range_vnb->ip.reass.next_range_bi;
+ }
+ }
+ if (*bi0 != ~0)
+ {
+ vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
+ vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
+ if (~0 != reass->first_bi)
+ {
+ fvnb->ip.reass.next_range_bi = reass->first_bi;
+ reass->first_bi = *bi0;
+ }
+ else
+ {
+ reass->first_bi = *bi0;
+ fvnb->ip.reass.next_range_bi = ~0;
+ }
+ *bi0 = ~0;
+ }
+}
+
+always_inline void
+ip6_full_reass_on_timeout (vlib_main_t *vm, vlib_node_runtime_t *node,
+ ip6_full_reass_t *reass, u32 *icmp_bi,
+ u32 *n_left_to_next, u32 **to_next)
{
if (~0 == reass->first_bi)
{
@@ -493,15 +587,16 @@ ip6_full_reass_on_timeout (vlib_main_t * vm, vlib_node_runtime_t * node,
0);
}
}
- ip6_full_reass_drop_all (vm, node, reass);
+ ip6_full_reass_drop_all (vm, node, reass, n_left_to_next, to_next);
}
always_inline ip6_full_reass_t *
-ip6_full_reass_find_or_create (vlib_main_t * vm, vlib_node_runtime_t * node,
- ip6_full_reass_main_t * rm,
- ip6_full_reass_per_thread_t * rt,
- ip6_full_reass_kv_t * kv, u32 * icmp_bi,
- u8 * do_handoff)
+ip6_full_reass_find_or_create (vlib_main_t *vm, vlib_node_runtime_t *node,
+ ip6_full_reass_main_t *rm,
+ ip6_full_reass_per_thread_t *rt,
+ ip6_full_reass_kv_t *kv, u32 *icmp_bi,
+ u8 *do_handoff, int skip_bihash,
+ u32 *n_left_to_next, u32 **to_next)
{
ip6_full_reass_t *reass;
f64 now;
@@ -511,7 +606,7 @@ again:
reass = NULL;
now = vlib_time_now (vm);
- if (!clib_bihash_search_48_8 (&rm->hash, &kv->kv, &kv->kv))
+ if (!skip_bihash && !clib_bihash_search_48_8 (&rm->hash, &kv->kv, &kv->kv))
{
if (vm->thread_index != kv->v.memory_owner_thread_index)
{
@@ -526,7 +621,10 @@ again:
if (now > reass->last_heard + rm->timeout)
{
- ip6_full_reass_on_timeout (vm, node, reass, icmp_bi);
+ vlib_node_increment_counter (vm, node->node_index,
+ IP6_ERROR_REASS_TIMEOUT, 1);
+ ip6_full_reass_on_timeout (vm, node, reass, icmp_bi, n_left_to_next,
+ to_next);
ip6_full_reass_free (rm, rt, reass);
reass = NULL;
}
@@ -554,27 +652,41 @@ again:
reass->data_len = 0;
reass->next_index = ~0;
reass->error_next_index = ~0;
+ reass->memory_owner_thread_index = vm->thread_index;
++rt->reass_n;
}
- reass->key.as_u64[0] = kv->kv.key[0];
- reass->key.as_u64[1] = kv->kv.key[1];
- reass->key.as_u64[2] = kv->kv.key[2];
- reass->key.as_u64[3] = kv->kv.key[3];
- reass->key.as_u64[4] = kv->kv.key[4];
- reass->key.as_u64[5] = kv->kv.key[5];
kv->v.reass_index = (reass - rt->pool);
kv->v.memory_owner_thread_index = vm->thread_index;
reass->last_heard = now;
- int rv = clib_bihash_add_del_48_8 (&rm->hash, &kv->kv, 2);
- if (rv)
+ if (!skip_bihash)
{
- ip6_full_reass_free (rm, rt, reass);
- reass = NULL;
- // if other worker created a context already work with the other copy
- if (-2 == rv)
- goto again;
+ reass->key.as_u64[0] = kv->kv.key[0];
+ reass->key.as_u64[1] = kv->kv.key[1];
+ reass->key.as_u64[2] = kv->kv.key[2];
+ reass->key.as_u64[3] = kv->kv.key[3];
+ reass->key.as_u64[4] = kv->kv.key[4];
+ reass->key.as_u64[5] = kv->kv.key[5];
+
+ int rv = clib_bihash_add_del_48_8 (&rm->hash, &kv->kv, 2);
+ if (rv)
+ {
+ ip6_full_reass_free (rm, rt, reass);
+ reass = NULL;
+ // if other worker created a context already work with the other copy
+ if (-2 == rv)
+ goto again;
+ }
+ }
+ else
+ {
+ reass->key.as_u64[0] = ~0;
+ reass->key.as_u64[1] = ~0;
+ reass->key.as_u64[2] = ~0;
+ reass->key.as_u64[3] = ~0;
+ reass->key.as_u64[4] = ~0;
+ reass->key.as_u64[5] = ~0;
}
return reass;
@@ -593,8 +705,6 @@ ip6_full_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
vlib_buffer_t *last_b = NULL;
u32 sub_chain_bi = reass->first_bi;
u32 total_length = 0;
- u32 buf_cnt = 0;
- u32 dropped_cnt = 0;
u32 *vec_drop_compress = NULL;
ip6_full_reass_rc_t rv = IP6_FULL_REASS_RC_OK;
do
@@ -636,19 +746,18 @@ ip6_full_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
vlib_buffer_length_in_chain (vm, tmp) - trim_front - trim_end;
while (1)
{
- ++buf_cnt;
if (trim_front)
{
if (trim_front > tmp->current_length)
{
/* drop whole buffer */
- vec_add1 (vec_drop_compress, tmp_bi);
- trim_front -= tmp->current_length;
if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
{
rv = IP6_FULL_REASS_RC_INTERNAL_ERROR;
goto free_buffers_and_return;
}
+ trim_front -= tmp->current_length;
+ vec_add1 (vec_drop_compress, tmp_bi);
tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
tmp_bi = tmp->next_buffer;
tmp = vlib_get_buffer (vm, tmp_bi);
@@ -686,13 +795,12 @@ ip6_full_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
}
else
{
- vec_add1 (vec_drop_compress, tmp_bi);
if (reass->first_bi == tmp_bi)
{
rv = IP6_FULL_REASS_RC_INTERNAL_ERROR;
goto free_buffers_and_return;
}
- ++dropped_cnt;
+ vec_add1 (vec_drop_compress, tmp_bi);
}
if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
{
@@ -729,19 +837,27 @@ ip6_full_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
vnet_buffer_opaque_t *first_b_vnb = vnet_buffer (first_b);
ip6_header_t *ip = vlib_buffer_get_current (first_b);
u16 ip6_frag_hdr_offset = first_b_vnb->ip.reass.ip6_frag_hdr_offset;
- ip6_ext_header_t *prev_hdr;
- frag_hdr =
- ip6_ext_header_find (vm, first_b, ip, IP_PROTOCOL_IPV6_FRAGMENTATION,
- &prev_hdr);
- if (prev_hdr)
+ ip6_ext_hdr_chain_t hdr_chain;
+ ip6_ext_header_t *prev_hdr = 0;
+ int res = ip6_ext_header_walk (first_b, ip, IP_PROTOCOL_IPV6_FRAGMENTATION,
+ &hdr_chain);
+ if (res < 0 ||
+ (hdr_chain.eh[res].protocol != IP_PROTOCOL_IPV6_FRAGMENTATION))
{
+ rv = IP6_FULL_REASS_RC_INTERNAL_ERROR;
+ goto free_buffers_and_return;
+ }
+ frag_hdr = ip6_ext_next_header_offset (ip, hdr_chain.eh[res].offset);
+ if (res > 0)
+ {
+ prev_hdr = ip6_ext_next_header_offset (ip, hdr_chain.eh[res - 1].offset);
prev_hdr->next_hdr = frag_hdr->next_hdr;
}
else
{
ip->protocol = frag_hdr->next_hdr;
}
- if (!((u8 *) frag_hdr - (u8 *) ip == ip6_frag_hdr_offset))
+ if (hdr_chain.eh[res].offset != ip6_frag_hdr_offset)
{
rv = IP6_FULL_REASS_RC_INTERNAL_ERROR;
goto free_buffers_and_return;
@@ -799,6 +915,15 @@ ip6_full_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
*next0 = reass->next_index;
}
vnet_buffer (first_b)->ip.reass.estimated_mtu = reass->min_fragment_length;
+ /* Keep track of number of successfully reassembled packets and number of
+ * fragments reassembled */
+ vlib_node_increment_counter (vm, node->node_index, IP6_ERROR_REASS_SUCCESS,
+ 1);
+
+ vlib_node_increment_counter (vm, node->node_index,
+ IP6_ERROR_REASS_FRAGMENTS_REASSEMBLED,
+ reass->fragments_n);
+
ip6_full_reass_free (rm, rt, reass);
reass = NULL;
free_buffers_and_return:
@@ -834,12 +959,13 @@ ip6_full_reass_insert_range_in_chain (vlib_main_t * vm,
}
always_inline ip6_full_reass_rc_t
-ip6_full_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
- ip6_full_reass_main_t * rm,
- ip6_full_reass_per_thread_t * rt,
- ip6_full_reass_t * reass, u32 * bi0, u32 * next0,
- u32 * error0, ip6_frag_hdr_t * frag_hdr,
- bool is_custom_app, u32 * handoff_thread_idx)
+ip6_full_reass_update (vlib_main_t *vm, vlib_node_runtime_t *node,
+ ip6_full_reass_main_t *rm,
+ ip6_full_reass_per_thread_t *rt,
+ ip6_full_reass_t *reass, u32 *bi0, u32 *next0,
+ u32 *error0, ip6_frag_hdr_t *frag_hdr,
+ bool is_custom_app, u32 *handoff_thread_idx,
+ int skip_bihash)
{
int consumed = 0;
vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
@@ -865,6 +991,10 @@ ip6_full_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
u32 fragment_length =
vlib_buffer_length_in_chain (vm, fb) -
(fvnb->ip.reass.ip6_frag_hdr_offset + sizeof (*frag_hdr));
+ if (0 == fragment_length)
+ {
+ return IP6_FULL_REASS_RC_INVALID_FRAG_LEN;
+ }
u32 fragment_last = fvnb->ip.reass.fragment_last =
fragment_first + fragment_length - 1;
int more_fragments = ip6_frag_hdr_more (frag_hdr);
@@ -929,11 +1059,7 @@ ip6_full_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
ip6_full_reass_add_trace (vm, node, reass, *bi0, frag_hdr,
RANGE_OVERLAP, ~0);
}
- ip6_full_reass_drop_all (vm, node, reass);
- ip6_full_reass_free (rm, rt, reass);
- *next0 = IP6_FULL_REASSEMBLY_NEXT_DROP;
- *error0 = IP6_ERROR_REASS_OVERLAPPING_FRAGMENT;
- return IP6_FULL_REASS_RC_OK;
+ return IP6_FULL_REASS_RC_OVERLAP;
}
break;
}
@@ -947,6 +1073,12 @@ check_if_done_maybe:
~0);
}
}
+ else if (skip_bihash)
+ {
+ // if this reassembly is not in bihash, then the packet must have been
+ // consumed
+ return IP6_FULL_REASS_RC_INTERNAL_ERROR;
+ }
if (~0 != reass->last_packet_octet &&
reass->data_len == reass->last_packet_octet + 1)
{
@@ -964,6 +1096,12 @@ check_if_done_maybe:
}
else
{
+ if (skip_bihash)
+ {
+ // if this reassembly is not in bihash, it should've been an atomic
+ // fragment and thus finalized
+ return IP6_FULL_REASS_RC_INTERNAL_ERROR;
+ }
if (consumed)
{
*bi0 = ~0;
@@ -982,31 +1120,28 @@ check_if_done_maybe:
}
always_inline bool
-ip6_full_reass_verify_upper_layer_present (vlib_node_runtime_t * node,
- vlib_buffer_t * b,
- ip6_frag_hdr_t * frag_hdr)
+ip6_full_reass_verify_upper_layer_present (vlib_node_runtime_t *node,
+ vlib_buffer_t *b,
+ ip6_ext_hdr_chain_t *hc)
{
- ip6_ext_header_t *tmp = (ip6_ext_header_t *) frag_hdr;
- while (ip6_ext_hdr (tmp->next_hdr))
- {
- tmp = ip6_ext_next_header (tmp);
- }
- if (IP_PROTOCOL_IP6_NONXT == tmp->next_hdr)
+ int nh = hc->eh[hc->length - 1].protocol;
+  /* the last header in the chain must be a terminating (upper-layer)
+   * header, not another extension header */
+ if (ip6_ext_hdr (nh))
{
- icmp6_error_set_vnet_buffer (b, ICMP6_parameter_problem,
- ICMP6_parameter_problem_first_fragment_has_incomplete_header_chain,
- 0);
+ icmp6_error_set_vnet_buffer (
+ b, ICMP6_parameter_problem,
+ ICMP6_parameter_problem_first_fragment_has_incomplete_header_chain, 0);
b->error = node->errors[IP6_ERROR_REASS_MISSING_UPPER];
-
return false;
}
return true;
}
always_inline bool
-ip6_full_reass_verify_fragment_multiple_8 (vlib_main_t * vm,
- vlib_buffer_t * b,
- ip6_frag_hdr_t * frag_hdr)
+ip6_full_reass_verify_fragment_multiple_8 (vlib_main_t *vm,
+ vlib_node_runtime_t *node,
+ vlib_buffer_t *b,
+ ip6_frag_hdr_t *frag_hdr)
{
vnet_buffer_opaque_t *vnb = vnet_buffer (b);
ip6_header_t *ip = vlib_buffer_get_current (b);
@@ -1019,15 +1154,17 @@ ip6_full_reass_verify_fragment_multiple_8 (vlib_main_t * vm,
icmp6_error_set_vnet_buffer (b, ICMP6_parameter_problem,
ICMP6_parameter_problem_erroneous_header_field,
(u8 *) & ip->payload_length - (u8 *) ip);
+ b->error = node->errors[IP6_ERROR_REASS_INVALID_FRAG_SIZE];
return false;
}
return true;
}
always_inline bool
-ip6_full_reass_verify_packet_size_lt_64k (vlib_main_t * vm,
- vlib_buffer_t * b,
- ip6_frag_hdr_t * frag_hdr)
+ip6_full_reass_verify_packet_size_lt_64k (vlib_main_t *vm,
+ vlib_node_runtime_t *node,
+ vlib_buffer_t *b,
+ ip6_frag_hdr_t *frag_hdr)
{
vnet_buffer_opaque_t *vnb = vnet_buffer (b);
u32 fragment_first = ip6_frag_hdr_offset_bytes (frag_hdr);
@@ -1041,16 +1178,16 @@ ip6_full_reass_verify_packet_size_lt_64k (vlib_main_t * vm,
ICMP6_parameter_problem_erroneous_header_field,
(u8 *) & frag_hdr->fragment_offset_and_more
- (u8 *) ip0);
+ b->error = node->errors[IP6_ERROR_REASS_INVALID_FRAG_SIZE];
return false;
}
return true;
}
always_inline uword
-ip6_full_reassembly_inline (vlib_main_t * vm,
- vlib_node_runtime_t * node,
- vlib_frame_t * frame, bool is_feature,
- bool is_custom_app)
+ip6_full_reassembly_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
+ vlib_frame_t *frame, bool is_feature,
+ bool is_custom_app, bool is_local)
{
u32 *from = vlib_frame_vector_args (frame);
u32 n_left_from, n_left_to_next, *to_next, next_index;
@@ -1077,55 +1214,95 @@ ip6_full_reassembly_inline (vlib_main_t * vm,
ip6_header_t *ip0 = vlib_buffer_get_current (b0);
ip6_frag_hdr_t *frag_hdr = NULL;
- ip6_ext_header_t *prev_hdr;
- if (ip6_ext_hdr (ip0->protocol))
+ ip6_ext_hdr_chain_t hdr_chain;
+ vnet_buffer_opaque_t *fvnb = vnet_buffer (b0);
+
+ int res = ip6_ext_header_walk (
+ b0, ip0, IP_PROTOCOL_IPV6_FRAGMENTATION, &hdr_chain);
+ if (res < 0 ||
+ hdr_chain.eh[res].protocol != IP_PROTOCOL_IPV6_FRAGMENTATION)
{
- frag_hdr =
- ip6_ext_header_find (vm, b0, ip0,
- IP_PROTOCOL_IPV6_FRAGMENTATION,
- &prev_hdr);
+ vlib_node_increment_counter (vm, node->node_index,
+ IP6_ERROR_REASS_NO_FRAG_HDR, 1);
+ // this is a mangled packet - no fragmentation
+ next0 = is_custom_app ? fvnb->ip.reass.error_next_index :
+ IP6_FULL_REASSEMBLY_NEXT_DROP;
+ ip6_full_reass_add_trace (vm, node, NULL, bi0, NULL, PASSTHROUGH,
+ ~0);
+ goto skip_reass;
}
- if (!frag_hdr)
+ if (is_local && !rm->is_local_reass_enabled)
{
- // this is a regular packet - no fragmentation
- next0 = IP6_FULL_REASSEMBLY_NEXT_INPUT;
+ next0 = IP6_FULL_REASSEMBLY_NEXT_DROP;
goto skip_reass;
}
+
+ /* Keep track of received fragments */
+ vlib_node_increment_counter (vm, node->node_index,
+ IP6_ERROR_REASS_FRAGMENTS_RCVD, 1);
+ frag_hdr =
+ ip6_ext_next_header_offset (ip0, hdr_chain.eh[res].offset);
vnet_buffer (b0)->ip.reass.ip6_frag_hdr_offset =
- (u8 *) frag_hdr - (u8 *) ip0;
+ hdr_chain.eh[res].offset;
if (0 == ip6_frag_hdr_offset (frag_hdr))
{
// first fragment - verify upper-layer is present
- if (!ip6_full_reass_verify_upper_layer_present
- (node, b0, frag_hdr))
+ if (!ip6_full_reass_verify_upper_layer_present (node, b0,
+ &hdr_chain))
{
- next0 = IP6_FULL_REASSEMBLY_NEXT_ICMP_ERROR;
+ next0 = is_custom_app ? fvnb->ip.reass.error_next_index :
+ IP6_FULL_REASSEMBLY_NEXT_ICMP_ERROR;
goto skip_reass;
}
}
- if (!ip6_full_reass_verify_fragment_multiple_8 (vm, b0, frag_hdr) ||
- !ip6_full_reass_verify_packet_size_lt_64k (vm, b0, frag_hdr))
+
+ if (!ip6_full_reass_verify_fragment_multiple_8 (vm, node, b0,
+ frag_hdr) ||
+ !ip6_full_reass_verify_packet_size_lt_64k (vm, node, b0,
+ frag_hdr))
{
- next0 = IP6_FULL_REASSEMBLY_NEXT_ICMP_ERROR;
+ next0 = is_custom_app ? fvnb->ip.reass.error_next_index :
+ IP6_FULL_REASSEMBLY_NEXT_ICMP_ERROR;
goto skip_reass;
}
+
+ int skip_bihash = 0;
ip6_full_reass_kv_t kv;
u8 do_handoff = 0;
- kv.k.as_u64[0] = ip0->src_address.as_u64[0];
- kv.k.as_u64[1] = ip0->src_address.as_u64[1];
- kv.k.as_u64[2] = ip0->dst_address.as_u64[0];
- kv.k.as_u64[3] = ip0->dst_address.as_u64[1];
- kv.k.as_u64[4] =
- ((u64) vec_elt (ip6_main.fib_index_by_sw_if_index,
- vnet_buffer (b0)->sw_if_index[VLIB_RX])) << 32 |
- (u64) frag_hdr->identification;
- kv.k.as_u64[5] = ip0->protocol;
+ if (0 == ip6_frag_hdr_offset (frag_hdr) &&
+ !ip6_frag_hdr_more (frag_hdr))
+ {
+	      // this is an atomic fragment and needs to be processed separately
+ skip_bihash = 1;
+ }
+ else
+ {
+ u32 fib_index =
+ (vnet_buffer (b0)->sw_if_index[VLIB_TX] == (u32) ~0) ?
+ vec_elt (ip6_main.fib_index_by_sw_if_index,
+ vnet_buffer (b0)->sw_if_index[VLIB_RX]) :
+ vnet_buffer (b0)->sw_if_index[VLIB_TX];
+ kv.k.as_u64[0] = ip0->src_address.as_u64[0];
+ kv.k.as_u64[1] = ip0->src_address.as_u64[1];
+ kv.k.as_u64[2] = ip0->dst_address.as_u64[0];
+ kv.k.as_u64[3] = ip0->dst_address.as_u64[1];
+ kv.k.as_u64[4] =
+ ((u64) fib_index) << 32 | (u64) frag_hdr->identification;
+ /* RFC 8200: The Next Header values in the Fragment headers of
+ * different fragments of the same original packet may differ.
+ * Only the value from the Offset zero fragment packet is used
+ * for reassembly.
+ *
+	       * Also, unlike IPv4, the IPv6 header itself does not carry
+	       * the upper-layer protocol value. */
+ kv.k.as_u64[5] = 0;
+ }
- ip6_full_reass_t *reass =
- ip6_full_reass_find_or_create (vm, node, rm, rt, &kv, &icmp_bi,
- &do_handoff);
+ ip6_full_reass_t *reass = ip6_full_reass_find_or_create (
+ vm, node, rm, rt, &kv, &icmp_bi, &do_handoff, skip_bihash,
+ &n_left_to_next, &to_next);
if (reass)
{
@@ -1144,9 +1321,10 @@ ip6_full_reassembly_inline (vlib_main_t * vm,
else if (reass)
{
u32 handoff_thread_idx;
- switch (ip6_full_reass_update
- (vm, node, rm, rt, reass, &bi0, &next0, &error0,
- frag_hdr, is_custom_app, &handoff_thread_idx))
+ u32 counter = ~0;
+ switch (ip6_full_reass_update (
+ vm, node, rm, rt, reass, &bi0, &next0, &error0, frag_hdr,
+ is_custom_app, &handoff_thread_idx, skip_bihash))
{
case IP6_FULL_REASS_RC_OK:
/* nothing to do here */
@@ -1158,25 +1336,36 @@ ip6_full_reassembly_inline (vlib_main_t * vm,
handoff_thread_idx;
break;
case IP6_FULL_REASS_RC_TOO_MANY_FRAGMENTS:
- vlib_node_increment_counter (vm, node->node_index,
- IP6_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG,
- 1);
- ip6_full_reass_drop_all (vm, node, reass);
- ip6_full_reass_free (rm, rt, reass);
- goto next_packet;
+ counter = IP6_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG;
break;
case IP6_FULL_REASS_RC_NO_BUF:
- vlib_node_increment_counter (vm, node->node_index,
- IP6_ERROR_REASS_NO_BUF, 1);
- ip6_full_reass_drop_all (vm, node, reass);
- ip6_full_reass_free (rm, rt, reass);
- goto next_packet;
+ counter = IP6_ERROR_REASS_NO_BUF;
+ break;
+ case IP6_FULL_REASS_RC_INVALID_FRAG_LEN:
+ counter = IP6_ERROR_REASS_INVALID_FRAG_LEN;
+ break;
+ case IP6_FULL_REASS_RC_OVERLAP:
+ counter = IP6_ERROR_REASS_OVERLAPPING_FRAGMENT;
break;
case IP6_FULL_REASS_RC_INTERNAL_ERROR:
- vlib_node_increment_counter (vm, node->node_index,
- IP6_ERROR_REASS_INTERNAL_ERROR,
+ counter = IP6_ERROR_REASS_INTERNAL_ERROR;
+	      /* Sanitization is needed in internal error cases only, as
+	       * the incoming packet is already dropped in other cases;
+	       * adding bi0 back to the reassembly list also fixes the
+	       * leaking of buffers during internal errors.
+	       *
+	       * It also doesn't make sense to send these buffers to the
+	       * custom app, as these fragments hit internal errors */
+ sanitize_reass_buffers_add_missing (vm, reass, &bi0);
+ reass->error_next_index = ~0;
+ break;
+ }
+ if (~0 != counter)
+ {
+ vlib_node_increment_counter (vm, node->node_index, counter,
1);
- ip6_full_reass_drop_all (vm, node, reass);
+ ip6_full_reass_drop_all (vm, node, reass, &n_left_to_next,
+ &to_next);
ip6_full_reass_free (rm, rt, reass);
goto next_packet;
break;
@@ -1190,7 +1379,6 @@ ip6_full_reassembly_inline (vlib_main_t * vm,
}
else
{
- vnet_buffer_opaque_t *fvnb = vnet_buffer (b0);
next0 = fvnb->ip.reass.error_next_index;
}
error0 = IP6_ERROR_REASS_LIMIT_REACHED;
@@ -1223,6 +1411,15 @@ ip6_full_reassembly_inline (vlib_main_t * vm,
{
vnet_feature_next (&next0, b0);
}
+
+	  /* Also increment the to-custom-app counter, as this fragment is
+	   * going to the application as well */
+ if (is_custom_app)
+ {
+ vlib_node_increment_counter (
+ vm, node->node_index, IP6_ERROR_REASS_TO_CUSTOM_APP, 1);
+ }
+
vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
n_left_to_next, bi0, next0);
}
@@ -1249,26 +1446,21 @@ ip6_full_reassembly_inline (vlib_main_t * vm,
return frame->n_vectors;
}
-static char *ip6_full_reassembly_error_strings[] = {
-#define _(sym, string) string,
- foreach_ip6_error
-#undef _
-};
-
VLIB_NODE_FN (ip6_full_reass_node) (vlib_main_t * vm,
vlib_node_runtime_t * node,
vlib_frame_t * frame)
{
- return ip6_full_reassembly_inline (vm, node, frame, false /* is_feature */ ,
- false /* is_custom_app */ );
+ return ip6_full_reassembly_inline (vm, node, frame, false /* is_feature */,
+ false /* is_custom_app */,
+ false /* is_local */);
}
VLIB_REGISTER_NODE (ip6_full_reass_node) = {
.name = "ip6-full-reassembly",
.vector_size = sizeof (u32),
.format_trace = format_ip6_full_reass_trace,
- .n_errors = ARRAY_LEN (ip6_full_reassembly_error_strings),
- .error_strings = ip6_full_reassembly_error_strings,
+ .n_errors = IP6_N_ERROR,
+ .error_counters = ip6_error_counters,
.n_next_nodes = IP6_FULL_REASSEMBLY_N_NEXT,
.next_nodes =
{
@@ -1279,20 +1471,45 @@ VLIB_REGISTER_NODE (ip6_full_reass_node) = {
},
};
+VLIB_NODE_FN (ip6_local_full_reass_node)
+(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
+{
+ return ip6_full_reassembly_inline (vm, node, frame, false /* is_feature */,
+ false /* is_custom_app */,
+ true /* is_local */);
+}
+
+VLIB_REGISTER_NODE (ip6_local_full_reass_node) = {
+ .name = "ip6-local-full-reassembly",
+ .vector_size = sizeof (u32),
+ .format_trace = format_ip6_full_reass_trace,
+ .n_errors = IP6_N_ERROR,
+ .error_counters = ip6_error_counters,
+ .n_next_nodes = IP6_FULL_REASSEMBLY_N_NEXT,
+ .next_nodes =
+ {
+ [IP6_FULL_REASSEMBLY_NEXT_INPUT] = "ip6-input",
+ [IP6_FULL_REASSEMBLY_NEXT_DROP] = "ip6-drop",
+ [IP6_FULL_REASSEMBLY_NEXT_ICMP_ERROR] = "ip6-icmp-error",
+ [IP6_FULL_REASSEMBLY_NEXT_HANDOFF] = "ip6-local-full-reassembly-handoff",
+ },
+};
+
VLIB_NODE_FN (ip6_full_reass_node_feature) (vlib_main_t * vm,
vlib_node_runtime_t * node,
vlib_frame_t * frame)
{
- return ip6_full_reassembly_inline (vm, node, frame, true /* is_feature */ ,
- false /* is_custom_app */ );
+ return ip6_full_reassembly_inline (vm, node, frame, true /* is_feature */,
+ false /* is_custom_app */,
+ false /* is_local */);
}
VLIB_REGISTER_NODE (ip6_full_reass_node_feature) = {
.name = "ip6-full-reassembly-feature",
.vector_size = sizeof (u32),
.format_trace = format_ip6_full_reass_trace,
- .n_errors = ARRAY_LEN (ip6_full_reassembly_error_strings),
- .error_strings = ip6_full_reassembly_error_strings,
+ .n_errors = IP6_N_ERROR,
+ .error_counters = ip6_error_counters,
.n_next_nodes = IP6_FULL_REASSEMBLY_N_NEXT,
.next_nodes =
{
@@ -1311,6 +1528,30 @@ VNET_FEATURE_INIT (ip6_full_reassembly_feature, static) = {
.runs_after = 0,
};
+VLIB_NODE_FN (ip6_full_reass_node_custom)
+(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
+{
+ return ip6_full_reassembly_inline (vm, node, frame, false /* is_feature */,
+ true /* is_custom_app */,
+ false /* is_local */);
+}
+
+VLIB_REGISTER_NODE (ip6_full_reass_node_custom) = {
+ .name = "ip6-full-reassembly-custom",
+ .vector_size = sizeof (u32),
+ .format_trace = format_ip6_full_reass_trace,
+ .n_errors = IP6_N_ERROR,
+ .error_counters = ip6_error_counters,
+ .n_next_nodes = IP6_FULL_REASSEMBLY_N_NEXT,
+ .next_nodes =
+ {
+ [IP6_FULL_REASSEMBLY_NEXT_INPUT] = "ip6-input",
+ [IP6_FULL_REASSEMBLY_NEXT_DROP] = "ip6-drop",
+ [IP6_FULL_REASSEMBLY_NEXT_ICMP_ERROR] = "ip6-icmp-error",
+ [IP6_FULL_REASSEMBLY_NEXT_HANDOFF] = "ip6-full-reass-custom-hoff",
+ },
+};
+
#ifndef CLIB_MARCH_VARIANT
static u32
ip6_full_reass_get_nbuckets ()
@@ -1319,7 +1560,9 @@ ip6_full_reass_get_nbuckets ()
u32 nbuckets;
u8 i;
- nbuckets = (u32) (rm->max_reass_n / IP6_FULL_REASS_HT_LOAD_FACTOR);
+ /* need more mem with more workers */
+ nbuckets = (u32) (rm->max_reass_n * (vlib_num_workers () + 1) /
+ IP6_FULL_REASS_HT_LOAD_FACTOR);
for (i = 0; i < 31; i++)
if ((1 << i) >= nbuckets)
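The hunk above scales the bucket count by the number of workers before rounding; the rounding loop continues past this hunk, and the sketch below assumes it rounds nbuckets up to the next power of two. A worked example with an assumed worker count of 4 and the default per-thread maximum:

/* Sketch: hash bucket sizing with worker scaling and power-of-two rounding. */
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  const uint32_t max_reass_n = 1024; /* per-thread default */
  const double load_factor = 0.75;   /* IP6_FULL_REASS_HT_LOAD_FACTOR */
  const uint32_t n_workers = 4;	     /* assumed worker count */

  uint32_t nbuckets =
    (uint32_t) (max_reass_n * (n_workers + 1) / load_factor);

  /* round up to the next power of two */
  for (uint8_t i = 0; i < 31; i++)
    if ((1u << i) >= nbuckets)
      {
	nbuckets = 1u << i;
	break;
      }

  printf ("nbuckets rounded to next power of two: %u\n", nbuckets);
  return 0;
}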
@@ -1446,9 +1689,6 @@ ip6_full_reass_init_function (vlib_main_t * vm)
clib_bihash_init_48_8 (&rm->hash, "ip6-full-reass", nbuckets,
nbuckets * 1024);
- node = vlib_get_node_by_name (vm, (u8 *) "ip6-drop");
- ASSERT (node);
- rm->ip6_drop_idx = node->index;
node = vlib_get_node_by_name (vm, (u8 *) "ip6-icmp-error");
ASSERT (node);
rm->ip6_icmp_error_idx = node->index;
@@ -1456,11 +1696,16 @@ ip6_full_reass_init_function (vlib_main_t * vm)
if ((error = vlib_call_init_function (vm, ip_main_init)))
return error;
ip6_register_protocol (IP_PROTOCOL_IPV6_FRAGMENTATION,
- ip6_full_reass_node.index);
+ ip6_local_full_reass_node.index);
+ rm->is_local_reass_enabled = 1;
rm->fq_index = vlib_frame_queue_main_init (ip6_full_reass_node.index, 0);
+ rm->fq_local_index =
+ vlib_frame_queue_main_init (ip6_local_full_reass_node.index, 0);
rm->fq_feature_index =
vlib_frame_queue_main_init (ip6_full_reass_node_feature.index, 0);
+ rm->fq_custom_index =
+ vlib_frame_queue_main_init (ip6_full_reass_node_custom.index, 0);
rm->feature_use_refcount_per_intf = NULL;
return error;
@@ -1504,26 +1749,53 @@ ip6_full_reass_walk_expired (vlib_main_t *vm, vlib_node_runtime_t *node,
int index;
const uword nthreads = vlib_num_workers () + 1;
u32 *vec_icmp_bi = NULL;
+ u32 n_left_to_next, *to_next;
+
for (thread_index = 0; thread_index < nthreads; ++thread_index)
{
ip6_full_reass_per_thread_t *rt =
&rm->per_thread_data[thread_index];
+ u32 reass_timeout_cnt = 0;
clib_spinlock_lock (&rt->lock);
vec_reset_length (pool_indexes_to_free);
- pool_foreach_index (index, rt->pool) {
- reass = pool_elt_at_index (rt->pool, index);
- if (now > reass->last_heard + rm->timeout)
- {
- vec_add1 (pool_indexes_to_free, index);
- }
- }
+      /* Pace the number of timeouts handled per thread, to avoid barrier
+       * sync issues in real-world scenarios */
+
+ u32 beg = rt->last_id;
+ /* to ensure we walk at least once per sec per context */
+ u32 end = beg + (IP6_FULL_REASS_MAX_REASSEMBLIES_DEFAULT *
+ IP6_FULL_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS /
+ MSEC_PER_SEC +
+ 1);
+ if (end > vec_len (rt->pool))
+ {
+ end = vec_len (rt->pool);
+ rt->last_id = 0;
+ }
+ else
+ {
+ rt->last_id = end;
+ }
+
+ pool_foreach_stepping_index (index, beg, end, rt->pool)
+ {
+ reass = pool_elt_at_index (rt->pool, index);
+ if (now > reass->last_heard + rm->timeout)
+ {
+ vec_add1 (pool_indexes_to_free, index);
+ }
+ }
+
int *i;
vec_foreach (i, pool_indexes_to_free)
{
ip6_full_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
u32 icmp_bi = ~0;
- ip6_full_reass_on_timeout (vm, node, reass, &icmp_bi);
+
+ reass_timeout_cnt += reass->fragments_n;
+ ip6_full_reass_on_timeout (vm, node, reass, &icmp_bi,
+ &n_left_to_next, &to_next);
if (~0 != icmp_bi)
vec_add1 (vec_icmp_bi, icmp_bi);
@@ -1531,6 +1803,10 @@ ip6_full_reass_walk_expired (vlib_main_t *vm, vlib_node_runtime_t *node,
}
clib_spinlock_unlock (&rt->lock);
+ if (reass_timeout_cnt)
+ vlib_node_increment_counter (vm, node->node_index,
+ IP6_ERROR_REASS_TIMEOUT,
+ reass_timeout_cnt);
}
while (vec_len (vec_icmp_bi) > 0)
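The pacing logic in the hunk above only scans a slice of the pool on each walk, sized so that every context is still visited roughly once per second. The standalone sketch below replays that window arithmetic with the default constants; it is illustrative only and assumes a full pool of 1024 entries:

/* Sketch: how many 50 ms walks it takes to step through the whole pool. */
#include <stdint.h>
#include <stdio.h>

#define MSEC_PER_SEC	  1000
#define MAX_REASSEMBLIES  1024
#define WALK_INTERVAL_MS  50

int
main (void)
{
  uint32_t pool_len = 1024; /* assumed current pool size */
  uint32_t last_id = 0;

  /* slice scanned per walk, as computed in ip6_full_reass_walk_expired() */
  uint32_t step =
    MAX_REASSEMBLIES * WALK_INTERVAL_MS / MSEC_PER_SEC + 1; /* == 52 */

  uint32_t walks = 0;
  do
    {
      uint32_t beg = last_id;
      uint32_t end = beg + step;
      if (end > pool_len)
	{
	  end = pool_len;
	  last_id = 0;
	}
      else
	last_id = end;
      walks++;
    }
  while (last_id != 0);

  /* with these defaults the whole pool is covered in 20 walks, ~1 second */
  printf ("walks to cover pool of %u: %u (%u ms)\n", pool_len, walks,
	  walks * WALK_INTERVAL_MS);
  return 0;
}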
@@ -1546,7 +1822,6 @@ ip6_full_reass_walk_expired (vlib_main_t *vm, vlib_node_runtime_t *node,
vlib_buffer_t *b = vlib_get_buffer (vm, bi);
if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED))
trace_frame = 1;
- b->error = node->errors[IP6_ERROR_REASS_TIMEOUT];
to_next[0] = bi;
++f->n_vectors;
to_next += 1;
@@ -1560,7 +1835,7 @@ ip6_full_reass_walk_expired (vlib_main_t *vm, vlib_node_runtime_t *node,
vec_free (vec_icmp_bi);
if (event_data)
{
- _vec_len (event_data) = 0;
+ vec_set_len (event_data, 0);
}
}
@@ -1568,14 +1843,13 @@ ip6_full_reass_walk_expired (vlib_main_t *vm, vlib_node_runtime_t *node,
}
VLIB_REGISTER_NODE (ip6_full_reass_expire_node) = {
- .function = ip6_full_reass_walk_expired,
- .format_trace = format_ip6_full_reass_trace,
- .type = VLIB_NODE_TYPE_PROCESS,
- .name = "ip6-full-reassembly-expire-walk",
-
- .n_errors = ARRAY_LEN (ip6_full_reassembly_error_strings),
- .error_strings = ip6_full_reassembly_error_strings,
+ .function = ip6_full_reass_walk_expired,
+ .format_trace = format_ip6_full_reass_trace,
+ .type = VLIB_NODE_TYPE_PROCESS,
+ .name = "ip6-full-reassembly-expire-walk",
+ .n_errors = IP6_N_ERROR,
+ .error_counters = ip6_error_counters,
};
static u8 *
@@ -1733,9 +2007,10 @@ format_ip6_full_reassembly_handoff_trace (u8 * s, va_list * args)
}
always_inline uword
-ip6_full_reassembly_handoff_inline (vlib_main_t * vm,
- vlib_node_runtime_t * node,
- vlib_frame_t * frame, bool is_feature)
+ip6_full_reassembly_handoff_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
+ vlib_frame_t *frame,
+ ip6_full_reass_node_type_t type,
+ bool is_local)
{
ip6_full_reass_main_t *rm = &ip6_full_reass_main;
@@ -1751,8 +2026,28 @@ ip6_full_reassembly_handoff_inline (vlib_main_t * vm,
b = bufs;
ti = thread_indices;
- fq_index = (is_feature) ? rm->fq_feature_index : rm->fq_index;
-
+ switch (type)
+ {
+ case NORMAL:
+ if (is_local)
+ {
+ fq_index = rm->fq_local_index;
+ }
+ else
+ {
+ fq_index = rm->fq_index;
+ }
+ break;
+ case FEATURE:
+ fq_index = rm->fq_feature_index;
+ break;
+ case CUSTOM:
+ fq_index = rm->fq_custom_index;
+ break;
+ default:
+ clib_warning ("Unexpected `type' (%d)!", type);
+ ASSERT (0);
+ }
while (n_left_from > 0)
{
ti[0] = vnet_buffer (b[0])->ip.reass.owner_thread_index;
@@ -1784,8 +2079,8 @@ VLIB_NODE_FN (ip6_full_reassembly_handoff_node) (vlib_main_t * vm,
vlib_node_runtime_t * node,
vlib_frame_t * frame)
{
- return ip6_full_reassembly_handoff_inline (vm, node, frame,
- false /* is_feature */ );
+ return ip6_full_reassembly_handoff_inline (vm, node, frame, NORMAL,
+ false /* is_local */);
}
VLIB_REGISTER_NODE (ip6_full_reassembly_handoff_node) = {
@@ -1802,14 +2097,34 @@ VLIB_REGISTER_NODE (ip6_full_reassembly_handoff_node) = {
},
};
+VLIB_NODE_FN (ip6_local_full_reassembly_handoff_node)
+(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
+{
+ return ip6_full_reassembly_handoff_inline (vm, node, frame, NORMAL,
+					      true /* is_local */);
+}
+
+VLIB_REGISTER_NODE (ip6_local_full_reassembly_handoff_node) = {
+ .name = "ip6-local-full-reassembly-handoff",
+ .vector_size = sizeof (u32),
+ .n_errors = ARRAY_LEN(ip6_full_reassembly_handoff_error_strings),
+ .error_strings = ip6_full_reassembly_handoff_error_strings,
+ .format_trace = format_ip6_full_reassembly_handoff_trace,
+
+ .n_next_nodes = 1,
+
+ .next_nodes = {
+ [0] = "error-drop",
+ },
+};
VLIB_NODE_FN (ip6_full_reassembly_feature_handoff_node) (vlib_main_t * vm,
vlib_node_runtime_t * node, vlib_frame_t * frame)
{
- return ip6_full_reassembly_handoff_inline (vm, node, frame, true /* is_feature */ );
+ return ip6_full_reassembly_handoff_inline (vm, node, frame, FEATURE,
+ false /* is_local */);
}
-
VLIB_REGISTER_NODE (ip6_full_reassembly_feature_handoff_node) = {
.name = "ip6-full-reass-feature-hoff",
.vector_size = sizeof (u32),
@@ -1824,6 +2139,27 @@ VLIB_REGISTER_NODE (ip6_full_reassembly_feature_handoff_node) = {
},
};
+VLIB_NODE_FN (ip6_full_reassembly_custom_handoff_node)
+(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
+{
+ return ip6_full_reassembly_handoff_inline (vm, node, frame, CUSTOM,
+ false /* is_local */);
+}
+
+VLIB_REGISTER_NODE (ip6_full_reassembly_custom_handoff_node) = {
+ .name = "ip6-full-reass-custom-hoff",
+ .vector_size = sizeof (u32),
+ .n_errors = ARRAY_LEN(ip6_full_reassembly_handoff_error_strings),
+ .error_strings = ip6_full_reassembly_handoff_error_strings,
+ .format_trace = format_ip6_full_reassembly_handoff_trace,
+
+ .n_next_nodes = 1,
+
+ .next_nodes = {
+ [0] = "error-drop",
+ },
+};
+
#ifndef CLIB_MARCH_VARIANT
int
ip6_full_reass_enable_disable_with_refcnt (u32 sw_if_index, int is_enable)
@@ -1849,8 +2185,37 @@ ip6_full_reass_enable_disable_with_refcnt (u32 sw_if_index, int is_enable)
"ip6-full-reassembly-feature",
sw_if_index, 0, 0, 0);
}
- return -1;
+ return 0;
+}
+
+void
+ip6_local_full_reass_enable_disable (int enable)
+{
+ if (enable)
+ {
+ if (!ip6_full_reass_main.is_local_reass_enabled)
+ {
+ ip6_full_reass_main.is_local_reass_enabled = 1;
+ ip6_register_protocol (IP_PROTOCOL_IPV6_FRAGMENTATION,
+ ip6_local_full_reass_node.index);
+ }
+ }
+ else
+ {
+ if (ip6_full_reass_main.is_local_reass_enabled)
+ {
+ ip6_full_reass_main.is_local_reass_enabled = 0;
+ ip6_unregister_protocol (IP_PROTOCOL_IPV6_FRAGMENTATION);
+ }
+ }
+}
+
+int
+ip6_local_full_reass_enabled ()
+{
+ return ip6_full_reass_main.is_local_reass_enabled;
}
+
#endif
/*
diff --git a/src/vnet/ip/reass/ip6_full_reass.h b/src/vnet/ip/reass/ip6_full_reass.h
index 546075b04b4..f66cb67d796 100644
--- a/src/vnet/ip/reass/ip6_full_reass.h
+++ b/src/vnet/ip/reass/ip6_full_reass.h
@@ -46,6 +46,8 @@ vnet_api_error_t ip6_full_reass_enable_disable (u32 sw_if_index,
int ip6_full_reass_enable_disable_with_refcnt (u32 sw_if_index,
int is_enable);
+void ip6_local_full_reass_enable_disable (int enable);
+int ip6_local_full_reass_enabled ();
#endif /* __included_ip6_full_reass_h */
/*
diff --git a/src/vnet/ip/reass/ip6_sv_reass.c b/src/vnet/ip/reass/ip6_sv_reass.c
index 28941311f50..fe2ed05555c 100644
--- a/src/vnet/ip/reass/ip6_sv_reass.c
+++ b/src/vnet/ip/reass/ip6_sv_reass.c
@@ -26,6 +26,7 @@
#include <vnet/ip/ip6_to_ip4.h>
#include <vppinfra/bihash_48_8.h>
#include <vnet/ip/reass/ip6_sv_reass.h>
+#include <vnet/ip/ip6_inlines.h>
#define MSEC_PER_SEC 1000
#define IP6_SV_REASS_TIMEOUT_DEFAULT_MS 100
@@ -40,6 +41,7 @@ typedef enum
IP6_SV_REASS_RC_TOO_MANY_FRAGMENTS,
IP6_SV_REASS_RC_INTERNAL_ERROR,
IP6_SV_REASS_RC_UNSUPP_IP_PROTO,
+ IP6_SV_REASS_RC_INVALID_FRAG_LEN,
} ip6_sv_reass_rc_t;
typedef struct
@@ -50,7 +52,7 @@ typedef struct
{
ip6_address_t src;
ip6_address_t dst;
- u32 xx_id;
+ u32 fib_index;
u32 frag_id;
u8 unused[7];
u8 proto;
@@ -148,6 +150,7 @@ typedef struct
/** Worker handoff */
u32 fq_index;
u32 fq_feature_index;
+ u32 fq_custom_context_index;
// reference count for enabling/disabling feature - per interface
u32 *feature_use_refcount_per_intf;
@@ -214,7 +217,7 @@ format_ip6_sv_reass_trace (u8 * s, va_list * args)
clib_net_to_host_u16 (t->l4_dst_port));
break;
case REASS_PASSTHROUGH:
- s = format (s, "[not-fragmented]");
+ s = format (s, "[not fragmented or atomic fragment]");
break;
}
return s;
@@ -309,6 +312,8 @@ ip6_sv_reass_find_or_create (vlib_main_t *vm, ip6_sv_reass_main_t *rm,
ip6_sv_reass_t *reass = NULL;
f64 now = vlib_time_now (vm);
+again:
+
if (!clib_bihash_search_48_8 (&rm->hash, &kv->kv, &kv->kv))
{
if (vm->thread_index != kv->v.thread_index)
@@ -368,10 +373,14 @@ ip6_sv_reass_find_or_create (vlib_main_t *vm, ip6_sv_reass_main_t *rm,
kv->v.thread_index = vm->thread_index;
reass->last_heard = now;
- if (clib_bihash_add_del_48_8 (&rm->hash, &kv->kv, 1))
+ int rv = clib_bihash_add_del_48_8 (&rm->hash, &kv->kv, 2);
+ if (rv)
{
ip6_sv_reass_free (vm, rm, rt, reass);
reass = NULL;
+ // if other worker created a context already work with the other copy
+ if (-2 == rv)
+ goto again;
}
return reass;
@@ -399,6 +408,10 @@ ip6_sv_reass_update (vlib_main_t *vm, vlib_node_runtime_t *node,
u32 fragment_length =
vlib_buffer_length_in_chain (vm, fb) -
(fvnb->ip.reass.ip6_frag_hdr_offset + sizeof (*frag_hdr));
+ if (0 == fragment_length)
+ {
+ return IP6_SV_REASS_RC_INVALID_FRAG_LEN;
+ }
u32 fragment_last = fvnb->ip.reass.fragment_last =
fragment_first + fragment_length - 1;
fvnb->ip.reass.range_first = fragment_first;
@@ -440,22 +453,18 @@ ip6_sv_reass_update (vlib_main_t *vm, vlib_node_runtime_t *node,
}
always_inline bool
-ip6_sv_reass_verify_upper_layer_present (vlib_node_runtime_t * node,
- vlib_buffer_t * b,
- ip6_frag_hdr_t * frag_hdr)
+ip6_sv_reass_verify_upper_layer_present (vlib_node_runtime_t *node,
+ vlib_buffer_t *b,
+ ip6_ext_hdr_chain_t *hc)
{
- ip6_ext_header_t *tmp = (ip6_ext_header_t *) frag_hdr;
- while (ip6_ext_hdr (tmp->next_hdr))
+ int nh = hc->eh[hc->length - 1].protocol;
+  /* the last header in the chain must be a terminating (upper-layer)
+   * header, not another extension header */
+ if (ip6_ext_hdr (nh))
{
- tmp = ip6_ext_next_header (tmp);
- }
- if (IP_PROTOCOL_IP6_NONXT == tmp->next_hdr)
- {
- icmp6_error_set_vnet_buffer (b, ICMP6_parameter_problem,
- ICMP6_parameter_problem_first_fragment_has_incomplete_header_chain,
- 0);
+ icmp6_error_set_vnet_buffer (
+ b, ICMP6_parameter_problem,
+ ICMP6_parameter_problem_first_fragment_has_incomplete_header_chain, 0);
b->error = node->errors[IP6_ERROR_REASS_MISSING_UPPER];
-
return false;
}
return true;
@@ -505,14 +514,18 @@ ip6_sv_reass_verify_packet_size_lt_64k (vlib_main_t * vm,
}
always_inline uword
-ip6_sv_reassembly_inline (vlib_main_t * vm,
- vlib_node_runtime_t * node,
- vlib_frame_t * frame, bool is_feature)
+ip6_sv_reassembly_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
+ vlib_frame_t *frame, bool is_feature,
+ bool custom_next, bool custom_context)
{
u32 *from = vlib_frame_vector_args (frame);
- u32 n_left_from, n_left_to_next, *to_next, next_index;
+ u32 n_left_from, n_left_to_next, *to_next, *to_next_aux, next_index;
ip6_sv_reass_main_t *rm = &ip6_sv_reass_main;
ip6_sv_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
+ u32 *context;
+ if (custom_context)
+ context = vlib_frame_aux_args (frame);
+
clib_spinlock_lock (&rt->lock);
n_left_from = frame->n_vectors;
@@ -520,7 +533,11 @@ ip6_sv_reassembly_inline (vlib_main_t * vm,
while (n_left_from > 0)
{
- vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+ if (custom_context)
+ vlib_get_next_frame_with_aux_safe (vm, node, next_index, to_next,
+ to_next_aux, n_left_to_next);
+ else
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
while (n_left_from > 0 && n_left_to_next > 0)
{
@@ -528,23 +545,31 @@ ip6_sv_reassembly_inline (vlib_main_t * vm,
vlib_buffer_t *b0;
u32 next0 = IP6_SV_REASSEMBLY_NEXT_DROP;
u32 error0 = IP6_ERROR_NONE;
-
+ u8 forward_context = 0;
bi0 = from[0];
b0 = vlib_get_buffer (vm, bi0);
ip6_header_t *ip0 = vlib_buffer_get_current (b0);
- ip6_frag_hdr_t *frag_hdr = NULL;
- ip6_ext_header_t *prev_hdr;
- if (ip6_ext_hdr (ip0->protocol))
+ ip6_frag_hdr_t *frag_hdr;
+ ip6_ext_hdr_chain_t hdr_chain;
+ bool is_atomic_fragment = false;
+
+ int res = ip6_ext_header_walk (
+ b0, ip0, IP_PROTOCOL_IPV6_FRAGMENTATION, &hdr_chain);
+ if (res >= 0 &&
+ hdr_chain.eh[res].protocol == IP_PROTOCOL_IPV6_FRAGMENTATION)
{
frag_hdr =
- ip6_ext_header_find (vm, b0, ip0,
- IP_PROTOCOL_IPV6_FRAGMENTATION,
- &prev_hdr);
+ ip6_ext_next_header_offset (ip0, hdr_chain.eh[res].offset);
+ is_atomic_fragment = (0 == ip6_frag_hdr_offset (frag_hdr) &&
+ !ip6_frag_hdr_more (frag_hdr));
}
- if (!frag_hdr)
+
+ if (res < 0 ||
+ hdr_chain.eh[res].protocol != IP_PROTOCOL_IPV6_FRAGMENTATION ||
+ is_atomic_fragment)
{
- // this is a regular packet - no fragmentation
+ // this is a regular unfragmented packet or an atomic fragment
if (!ip6_get_port
(vm, b0, ip0, b0->current_length,
&(vnet_buffer (b0)->ip.reass.ip_proto),
@@ -560,7 +585,8 @@ ip6_sv_reassembly_inline (vlib_main_t * vm,
goto packet_enqueue;
}
vnet_buffer (b0)->ip.reass.is_non_first_fragment = 0;
- next0 = IP6_SV_REASSEMBLY_NEXT_INPUT;
+ next0 = custom_next ? vnet_buffer (b0)->ip.reass.next_index :
+ IP6_SV_REASSEMBLY_NEXT_INPUT;
if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
{
ip6_sv_reass_add_trace (
@@ -571,13 +597,15 @@ ip6_sv_reassembly_inline (vlib_main_t * vm,
}
goto packet_enqueue;
}
+
vnet_buffer (b0)->ip.reass.ip6_frag_hdr_offset =
- (u8 *) frag_hdr - (u8 *) ip0;
+ hdr_chain.eh[res].offset;
+
if (0 == ip6_frag_hdr_offset (frag_hdr))
{
// first fragment - verify upper-layer is present
- if (!ip6_sv_reass_verify_upper_layer_present
- (node, b0, frag_hdr))
+ if (!ip6_sv_reass_verify_upper_layer_present (node, b0,
+ &hdr_chain))
{
next0 = IP6_SV_REASSEMBLY_NEXT_ICMP_ERROR;
goto packet_enqueue;
@@ -597,10 +625,15 @@ ip6_sv_reassembly_inline (vlib_main_t * vm,
kv.k.as_u64[1] = ip0->src_address.as_u64[1];
kv.k.as_u64[2] = ip0->dst_address.as_u64[0];
kv.k.as_u64[3] = ip0->dst_address.as_u64[1];
- kv.k.as_u64[4] =
- ((u64) vec_elt (ip6_main.fib_index_by_sw_if_index,
- vnet_buffer (b0)->sw_if_index[VLIB_RX])) << 32 |
- (u64) frag_hdr->identification;
+ if (custom_context)
+ kv.k.as_u64[4] =
+ (u64) *context << 32 | (u64) frag_hdr->identification;
+ else
+ kv.k.as_u64[4] =
+ ((u64) vec_elt (ip6_main.fib_index_by_sw_if_index,
+ vnet_buffer (b0)->sw_if_index[VLIB_RX]))
+ << 32 |
+ (u64) frag_hdr->identification;
kv.k.as_u64[5] = ip0->protocol;
ip6_sv_reass_t *reass =
@@ -611,6 +644,8 @@ ip6_sv_reassembly_inline (vlib_main_t * vm,
next0 = IP6_SV_REASSEMBLY_NEXT_HANDOFF;
vnet_buffer (b0)->ip.reass.owner_thread_index =
kv.v.thread_index;
+ if (custom_context)
+ forward_context = 1;
goto packet_enqueue;
}
@@ -635,7 +670,8 @@ ip6_sv_reassembly_inline (vlib_main_t * vm,
reass->tcp_seq_number;
vnet_buffer (b0)->ip.reass.l4_src_port = reass->l4_src_port;
vnet_buffer (b0)->ip.reass.l4_dst_port = reass->l4_dst_port;
- next0 = IP6_SV_REASSEMBLY_NEXT_INPUT;
+ next0 = custom_next ? vnet_buffer (b0)->ip.reass.next_index :
+ IP6_SV_REASSEMBLY_NEXT_INPUT;
if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
{
ip6_sv_reass_add_trace (
@@ -645,31 +681,30 @@ ip6_sv_reassembly_inline (vlib_main_t * vm,
goto packet_enqueue;
}
+ u32 counter = ~0;
switch (ip6_sv_reass_update (vm, node, rm, reass, bi0, frag_hdr))
{
case IP6_SV_REASS_RC_OK:
/* nothing to do here */
break;
case IP6_SV_REASS_RC_TOO_MANY_FRAGMENTS:
- vlib_node_increment_counter (vm, node->node_index,
- IP6_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG,
- 1);
- ip6_sv_reass_free (vm, rm, rt, reass);
- goto next_packet;
+ counter = IP6_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG;
break;
case IP6_SV_REASS_RC_UNSUPP_IP_PROTO:
- vlib_node_increment_counter (vm, node->node_index,
- IP6_ERROR_REASS_UNSUPP_IP_PROTO,
- 1);
- ip6_sv_reass_free (vm, rm, rt, reass);
- goto next_packet;
+ counter = IP6_ERROR_REASS_UNSUPP_IP_PROTO;
break;
case IP6_SV_REASS_RC_INTERNAL_ERROR:
- vlib_node_increment_counter (vm, node->node_index,
- IP6_ERROR_REASS_INTERNAL_ERROR, 1);
+ counter = IP6_ERROR_REASS_INTERNAL_ERROR;
+ break;
+ case IP6_SV_REASS_RC_INVALID_FRAG_LEN:
+ counter = IP6_ERROR_REASS_INVALID_FRAG_LEN;
+ break;
+ }
+ if (~0 != counter)
+ {
+ vlib_node_increment_counter (vm, node->node_index, counter, 1);
ip6_sv_reass_free (vm, rm, rt, reass);
goto next_packet;
- break;
}
if (reass->is_complete)
@@ -717,7 +752,8 @@ ip6_sv_reassembly_inline (vlib_main_t * vm,
to_next, n_left_to_next, bi0,
next0);
}
- _vec_len (reass->cached_buffers) = 0; // buffers are owned by frame now
+ vec_set_len (reass->cached_buffers,
+ 0); // buffers are owned by frame now
}
goto next_packet;
@@ -730,11 +766,25 @@ ip6_sv_reassembly_inline (vlib_main_t * vm,
b0 = vlib_get_buffer (vm, bi0);
vnet_feature_next (&next0, b0);
}
- vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
- n_left_to_next, bi0, next0);
+ if (custom_context && forward_context)
+ {
+ if (to_next_aux)
+ {
+ to_next_aux[0] = *context;
+ to_next_aux += 1;
+ }
+ vlib_validate_buffer_enqueue_with_aux_x1 (
+ vm, node, next_index, to_next, to_next_aux, n_left_to_next,
+ bi0, *context, next0);
+ }
+ else
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+ n_left_to_next, bi0, next0);
next_packet:
from += 1;
+ if (custom_context)
+ context += 1;
n_left_from -= 1;
}
@@ -745,26 +795,21 @@ ip6_sv_reassembly_inline (vlib_main_t * vm,
return frame->n_vectors;
}
-static char *ip6_sv_reassembly_error_strings[] = {
-#define _(sym, string) string,
- foreach_ip6_error
-#undef _
-};
-
VLIB_NODE_FN (ip6_sv_reass_node) (vlib_main_t * vm,
vlib_node_runtime_t * node,
vlib_frame_t * frame)
{
- return ip6_sv_reassembly_inline (vm, node, frame, false /* is_feature */ );
+ return ip6_sv_reassembly_inline (vm, node, frame, false /* is_feature */,
+ false /* custom next */,
+ false /* custom context */);
}
-/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip6_sv_reass_node) = {
.name = "ip6-sv-reassembly",
.vector_size = sizeof (u32),
.format_trace = format_ip6_sv_reass_trace,
- .n_errors = ARRAY_LEN (ip6_sv_reassembly_error_strings),
- .error_strings = ip6_sv_reassembly_error_strings,
+ .n_errors = IP6_N_ERROR,
+ .error_counters = ip6_error_counters,
.n_next_nodes = IP6_SV_REASSEMBLY_N_NEXT,
.next_nodes =
{
@@ -774,22 +819,22 @@ VLIB_REGISTER_NODE (ip6_sv_reass_node) = {
[IP6_SV_REASSEMBLY_NEXT_HANDOFF] = "ip6-sv-reassembly-handoff",
},
};
-/* *INDENT-ON* */
VLIB_NODE_FN (ip6_sv_reass_node_feature) (vlib_main_t * vm,
vlib_node_runtime_t * node,
vlib_frame_t * frame)
{
- return ip6_sv_reassembly_inline (vm, node, frame, true /* is_feature */ );
+ return ip6_sv_reassembly_inline (vm, node, frame, true /* is_feature */,
+ false /* custom next */,
+ false /* custom context */);
}
-/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip6_sv_reass_node_feature) = {
.name = "ip6-sv-reassembly-feature",
.vector_size = sizeof (u32),
.format_trace = format_ip6_sv_reass_trace,
- .n_errors = ARRAY_LEN (ip6_sv_reassembly_error_strings),
- .error_strings = ip6_sv_reassembly_error_strings,
+ .n_errors = IP6_N_ERROR,
+ .error_counters = ip6_error_counters,
.n_next_nodes = IP6_SV_REASSEMBLY_N_NEXT,
.next_nodes =
{
@@ -799,16 +844,38 @@ VLIB_REGISTER_NODE (ip6_sv_reass_node_feature) = {
[IP6_SV_REASSEMBLY_NEXT_HANDOFF] = "ip6-sv-reass-feature-hoff",
},
};
-/* *INDENT-ON* */
-/* *INDENT-OFF* */
VNET_FEATURE_INIT (ip6_sv_reassembly_feature) = {
.arc_name = "ip6-unicast",
.node_name = "ip6-sv-reassembly-feature",
.runs_before = VNET_FEATURES ("ip6-lookup"),
.runs_after = 0,
};
-/* *INDENT-ON* */
+
+VLIB_NODE_FN (ip6_sv_reass_custom_context_node)
+(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
+{
+ return ip6_sv_reassembly_inline (vm, node, frame, false /* is_feature */,
+ true /* custom next */,
+ true /* custom context */);
+}
+
+VLIB_REGISTER_NODE (ip6_sv_reass_custom_context_node) = {
+ .name = "ip6-sv-reassembly-custom-context",
+ .vector_size = sizeof (u32),
+ .aux_size = sizeof (u32),
+ .format_trace = format_ip6_sv_reass_trace,
+ .n_errors = IP6_N_ERROR,
+ .error_counters = ip6_error_counters,
+ .n_next_nodes = IP6_SV_REASSEMBLY_N_NEXT,
+ .next_nodes =
+ {
+ [IP6_SV_REASSEMBLY_NEXT_INPUT] = "ip6-input",
+ [IP6_SV_REASSEMBLY_NEXT_DROP] = "ip6-drop",
+ [IP6_SV_REASSEMBLY_NEXT_ICMP_ERROR] = "ip6-icmp-error",
+ [IP6_SV_REASSEMBLY_NEXT_HANDOFF] = "ip6-sv-reassembly-custom-context-handoff",
+ },
+};
#ifndef CLIB_MARCH_VARIANT
static u32
@@ -959,6 +1026,8 @@ ip6_sv_reass_init_function (vlib_main_t * vm)
rm->fq_index = vlib_frame_queue_main_init (ip6_sv_reass_node.index, 0);
rm->fq_feature_index =
vlib_frame_queue_main_init (ip6_sv_reass_node_feature.index, 0);
+ rm->fq_custom_context_index =
+ vlib_frame_queue_main_init (ip6_sv_reass_custom_context_node.index, 0);
rm->feature_use_refcount_per_intf = NULL;
@@ -1009,7 +1078,6 @@ ip6_sv_reass_walk_expired (vlib_main_t *vm,
clib_spinlock_lock (&rt->lock);
vec_reset_length (pool_indexes_to_free);
- /* *INDENT-OFF* */
pool_foreach_index (index, rt->pool) {
reass = pool_elt_at_index (rt->pool, index);
if (now > reass->last_heard + rm->timeout)
@@ -1017,15 +1085,12 @@ ip6_sv_reass_walk_expired (vlib_main_t *vm,
vec_add1 (pool_indexes_to_free, index);
}
}
- /* *INDENT-ON* */
int *i;
- /* *INDENT-OFF* */
vec_foreach (i, pool_indexes_to_free)
{
ip6_sv_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
ip6_sv_reass_free (vm, rm, rt, reass);
}
- /* *INDENT-ON* */
clib_spinlock_unlock (&rt->lock);
}
@@ -1033,33 +1098,31 @@ ip6_sv_reass_walk_expired (vlib_main_t *vm,
vec_free (pool_indexes_to_free);
if (event_data)
{
- _vec_len (event_data) = 0;
+ vec_set_len (event_data, 0);
}
}
return 0;
}
-/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip6_sv_reass_expire_node) = {
- .function = ip6_sv_reass_walk_expired,
- .format_trace = format_ip6_sv_reass_trace,
- .type = VLIB_NODE_TYPE_PROCESS,
- .name = "ip6-sv-reassembly-expire-walk",
-
- .n_errors = ARRAY_LEN (ip6_sv_reassembly_error_strings),
- .error_strings = ip6_sv_reassembly_error_strings,
+ .function = ip6_sv_reass_walk_expired,
+ .format_trace = format_ip6_sv_reass_trace,
+ .type = VLIB_NODE_TYPE_PROCESS,
+ .name = "ip6-sv-reassembly-expire-walk",
+ .n_errors = IP6_N_ERROR,
+ .error_counters = ip6_error_counters,
};
-/* *INDENT-ON* */
static u8 *
format_ip6_sv_reass_key (u8 * s, va_list * args)
{
ip6_sv_reass_key_t *key = va_arg (*args, ip6_sv_reass_key_t *);
- s = format (s, "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
- key->xx_id, format_ip6_address, &key->src, format_ip6_address,
- &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
+ s =
+ format (s, "fib_index: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
+ key->fib_index, format_ip6_address, &key->src, format_ip6_address,
+ &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
return s;
}
@@ -1116,11 +1179,9 @@ show_ip6_sv_reass (vlib_main_t * vm, unformat_input_t * input,
clib_spinlock_lock (&rt->lock);
if (details)
{
- /* *INDENT-OFF* */
pool_foreach (reass, rt->pool) {
vlib_cli_output (vm, "%U", format_ip6_sv_reass, vm, reass);
}
- /* *INDENT-ON* */
}
sum_reass_n += rt->reass_n;
clib_spinlock_unlock (&rt->lock);
@@ -1146,13 +1207,11 @@ show_ip6_sv_reass (vlib_main_t * vm, unformat_input_t * input,
return 0;
}
-/* *INDENT-OFF* */
VLIB_CLI_COMMAND (show_ip6_sv_reassembly_cmd, static) = {
.path = "show ip6-sv-reassembly",
.short_help = "show ip6-sv-reassembly [details]",
.function = show_ip6_sv_reass,
};
-/* *INDENT-ON* */
#ifndef CLIB_MARCH_VARIANT
vnet_api_error_t
@@ -1202,25 +1261,29 @@ format_ip6_sv_reassembly_handoff_trace (u8 * s, va_list * args)
}
always_inline uword
-ip6_sv_reassembly_handoff_inline (vlib_main_t * vm,
- vlib_node_runtime_t * node,
- vlib_frame_t * frame, bool is_feature)
+ip6_sv_reassembly_handoff_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
+ vlib_frame_t *frame, bool is_feature,
+ bool custom_context)
{
ip6_sv_reass_main_t *rm = &ip6_sv_reass_main;
vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
- u32 n_enq, n_left_from, *from;
+ u32 n_enq, n_left_from, *from, *context;
u16 thread_indices[VLIB_FRAME_SIZE], *ti;
u32 fq_index;
from = vlib_frame_vector_args (frame);
+ if (custom_context)
+ context = vlib_frame_aux_args (frame);
n_left_from = frame->n_vectors;
vlib_get_buffers (vm, from, bufs, n_left_from);
b = bufs;
ti = thread_indices;
- fq_index = (is_feature) ? rm->fq_feature_index : rm->fq_index;
+ fq_index = (is_feature) ?
+ rm->fq_feature_index :
+ (custom_context ? rm->fq_custom_context_index : rm->fq_index);
while (n_left_from > 0)
{
@@ -1239,8 +1302,12 @@ ip6_sv_reassembly_handoff_inline (vlib_main_t * vm,
ti += 1;
b += 1;
}
- n_enq = vlib_buffer_enqueue_to_thread (vm, node, fq_index, from,
- thread_indices, frame->n_vectors, 1);
+ if (custom_context)
+ n_enq = vlib_buffer_enqueue_to_thread_with_aux (
+ vm, node, fq_index, from, context, thread_indices, frame->n_vectors, 1);
+ else
+ n_enq = vlib_buffer_enqueue_to_thread (
+ vm, node, fq_index, from, thread_indices, frame->n_vectors, 1);
if (n_enq < frame->n_vectors)
vlib_node_increment_counter (vm, node->node_index,
@@ -1253,11 +1320,10 @@ VLIB_NODE_FN (ip6_sv_reassembly_handoff_node) (vlib_main_t * vm,
vlib_node_runtime_t * node,
vlib_frame_t * frame)
{
- return ip6_sv_reassembly_handoff_inline (vm, node, frame,
- false /* is_feature */ );
+ return ip6_sv_reassembly_handoff_inline (
+ vm, node, frame, false /* is_feature */, false /* custom_context */);
}
-/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip6_sv_reassembly_handoff_node) = {
.name = "ip6-sv-reassembly-handoff",
.vector_size = sizeof (u32),
@@ -1276,11 +1342,11 @@ VLIB_REGISTER_NODE (ip6_sv_reassembly_handoff_node) = {
VLIB_NODE_FN (ip6_sv_reassembly_feature_handoff_node) (vlib_main_t * vm,
vlib_node_runtime_t * node, vlib_frame_t * frame)
{
- return ip6_sv_reassembly_handoff_inline (vm, node, frame, true /* is_feature */ );
+ return ip6_sv_reassembly_handoff_inline (
+ vm, node, frame, true /* is_feature */, false /* custom_context */);
}
-/* *INDENT-OFF* */
VLIB_REGISTER_NODE (ip6_sv_reassembly_feature_handoff_node) = {
.name = "ip6-sv-reass-feature-hoff",
.vector_size = sizeof (u32),
@@ -1294,7 +1360,28 @@ VLIB_REGISTER_NODE (ip6_sv_reassembly_feature_handoff_node) = {
[0] = "error-drop",
},
};
-/* *INDENT-ON* */
+
+VLIB_NODE_FN (ip6_sv_reassembly_custom_context_handoff_node)
+(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
+{
+ return ip6_sv_reassembly_handoff_inline (
+ vm, node, frame, false /* is_feature */, true /* custom_context */);
+}
+
+VLIB_REGISTER_NODE (ip6_sv_reassembly_custom_context_handoff_node) = {
+ .name = "ip6-sv-reassembly-custom-context-handoff",
+ .vector_size = sizeof (u32),
+ .aux_size = sizeof (u32),
+ .n_errors = ARRAY_LEN(ip6_sv_reassembly_handoff_error_strings),
+ .error_strings = ip6_sv_reassembly_handoff_error_strings,
+ .format_trace = format_ip6_sv_reassembly_handoff_trace,
+
+ .n_next_nodes = 1,
+
+ .next_nodes = {
+ [0] = "error-drop",
+ },
+};
#ifndef CLIB_MARCH_VARIANT
int
@@ -1323,6 +1410,14 @@ ip6_sv_reass_enable_disable_with_refcnt (u32 sw_if_index, int is_enable)
}
return 0;
}
+
+uword
+ip6_sv_reass_custom_context_register_next_node (uword node_index)
+{
+ return vlib_node_add_next (
+ vlib_get_main (), ip6_sv_reassembly_custom_context_handoff_node.index,
+ node_index);
+}
#endif
/*
diff --git a/src/vnet/ip/reass/ip6_sv_reass.h b/src/vnet/ip/reass/ip6_sv_reass.h
index 81ac2312bdf..7dc9df132dd 100644
--- a/src/vnet/ip/reass/ip6_sv_reass.h
+++ b/src/vnet/ip/reass/ip6_sv_reass.h
@@ -44,6 +44,7 @@ vnet_api_error_t ip6_sv_reass_enable_disable (u32 sw_if_index,
u8 enable_disable);
int ip6_sv_reass_enable_disable_with_refcnt (u32 sw_if_index, int is_enable);
+uword ip6_sv_reass_custom_context_register_next_node (uword node_index);
#endif /* __included_ip6_sv_reass_h */
diff --git a/src/vnet/ip/reass/reassembly.rst b/src/vnet/ip/reass/reassembly.rst
new file mode 100644
index 00000000000..49e0a8de6e6
--- /dev/null
+++ b/src/vnet/ip/reass/reassembly.rst
@@ -0,0 +1,221 @@
+.. _reassembly:
+
+IP Reassembly
+=============
+
+Some VPP functions need access to the whole packet and/or stream
+classification based on L4 headers. Reassembly functionality allows
+both the former and the latter.
+
+Full reassembly vs shallow (virtual) reassembly
+-----------------------------------------------
+
+There are two kinds of reassembly available in VPP:
+
+1. Full reassembly changes a stream of packet fragments into a single
+packet containing all of the reassembled data, with fragment bits
+cleared and the fragment header stripped (in the case of ip6). Note
+that the resulting packet may come out of reassembly as a buffer
+chain. Because it's impractical to parse headers which are split over
+multiple vnet buffers, vnet_buffer_chain_linearize() is called after
+reassembly so that L2/L3/L4 headers can be found in the first buffer.
+Full reassembly is costly and shouldn't be used unless necessary. Full
+reassembly is enabled by default for both ipv4 and ipv6 "for us"
+traffic - that is, packets aimed at VPP addresses. This can be
+disabled via the API if desired, in which case "for us" fragments are
+dropped.
+
+2. Shallow (virtual) reassembly allows various classifying and/or
+translating features to work with fragments without having to
+understand fragmentation. It works by extracting L4 data and adding
+it to vnet_buffer for each packet/fragment passing through SVR
+nodes. This operation is performed for both fragments and regular
+packets, allowing consuming code to treat all packets in the same way.
+SVR caches incoming packet fragments (buffers) until the first
+fragment is seen. Then it extracts L4 data from that first fragment,
+fills it in for any cached fragments and transmits them in the same
+order as they were received. From that point on, any other passing
+fragments get L4 data populated in vnet_buffer based on the reassembly
+context (see the sketch below).
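+
+As an illustration only, a feature running after an SVR node could
+classify traffic directly from this metadata roughly as follows. The
+``ip.reass`` field names are taken from VPP's vnet_buffer metadata;
+treat the exact layout as an assumption and check ``vnet/buffer.h``.
+
+.. code-block:: c
+
+    #include <vnet/vnet.h>
+    #include <vnet/buffer.h>
+
+    /* Sketch: compute a trivial flow hash for a buffer that already
+     * passed an SVR node.  SVR fills the ip.reass metadata for both
+     * fragments and regular packets, so the caller does not need to
+     * know whether b is a fragment. */
+    static_always_inline u32
+    example_flow_hash (vlib_buffer_t * b)
+    {
+      u16 src_port = vnet_buffer (b)->ip.reass.l4_src_port;
+      u16 dst_port = vnet_buffer (b)->ip.reass.l4_dst_port;
+      u8 proto = vnet_buffer (b)->ip.reass.ip_proto;
+
+      return ((u32) src_port << 16) ^ dst_port ^ ((u32) proto << 24);
+    }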
+
+Multi-worker behaviour
+^^^^^^^^^^^^^^^^^^^^^^
+
+Both reassembly types deal with fragments arriving on different workers
+via a handoff mechanism. All reassembly contexts are stored in pools.
+A bihash mapping the 5-tuple key to a value containing the pool index
+and thread index is used for lookups. When a lookup finds an existing
+reassembly on a different thread, the fragment is handed off to that
+thread. If the lookup fails, a new reassembly context is created and
+the current worker becomes the owner of that context. Further fragments
+received on other worker threads are then handed off to the owner
+worker thread.
+
+Full reassembly also remembers the thread index where the first
+fragment (i.e. the fragment with fragment offset 0) was seen and uses
+the handoff mechanism to send the reassembled packet out on that thread
+even if the pool owner is a different thread. This then requires an
+additional handoff to free the reassembly context, as only the pool
+owner can do that in a thread-safe way. The ownership decision is
+sketched below.
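+
+A rough sketch of the per-fragment ownership decision follows. The
+value layout (reassembly pool index plus owning thread index) matches
+the description above, but the type and helper names here are
+illustrative only, not the actual per-protocol structures.
+
+.. code-block:: c
+
+    #include <vppinfra/bihash_16_8.h>
+
+    /* illustrative value layout - pool index + owning thread */
+    typedef union
+    {
+      struct
+      {
+        u32 reass_index;
+        u32 thread_index;
+      };
+      u64 as_u64;
+    } example_reass_val_t;
+
+    /* returns 1 and sets *handoff_thread if the fragment must be handed
+     * off, 0 if the current worker owns the context, -1 if a new
+     * context needs to be created (current worker becomes the owner) */
+    static int
+    example_find_owner (clib_bihash_16_8_t * hash,
+                        clib_bihash_kv_16_8_t * kv, u32 my_thread_index,
+                        u32 * handoff_thread)
+    {
+      if (clib_bihash_search_16_8 (hash, kv, kv) == 0)
+        {
+          example_reass_val_t v = { .as_u64 = kv->value };
+          if (v.thread_index != my_thread_index)
+            {
+              *handoff_thread = v.thread_index;
+              return 1;
+            }
+          return 0;
+        }
+      return -1;
+    }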
+
+Limits
+^^^^^^
+
+Because reassembly could be an attack vector, there is a configurable
+limit on the number of concurrent reassemblies and also on the maximum
+number of fragments per packet.
+
+Custom applications
+^^^^^^^^^^^^^^^^^^^
+
+Both reassembly features can be used by custom applications which are
+not part of the VPP source tree. Be it patches or 3rd-party plugins,
+they can build their own graph paths by using the "-custom*" versions
+of the nodes. Reassembly then reads next_index and error_next_index
+for each buffer from vnet_buffer, allowing the custom application to
+steer both reassembled packets and any packets which are considered an
+error in whatever way the custom application requires (see the sketch
+below).
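+
+As a sketch, a plugin using the ip6 shallow-reassembly custom-context
+path might register its own node once at init time and then steer
+buffers via the vnet_buffer metadata. ``my_plugin_node_index`` is
+hypothetical, and the exact ``ip.reass`` field names should be checked
+against ``vnet/buffer.h``.
+
+.. code-block:: c
+
+    #include <vnet/vnet.h>
+    #include <vnet/buffer.h>
+    #include <vnet/ip/reass/ip6_sv_reass.h>
+
+    /* called once at plugin init: register our node as a next node of
+     * the custom-context reassembly path and remember the next index */
+    static u32
+    my_plugin_register (u32 my_plugin_node_index)
+    {
+      return (u32) ip6_sv_reass_custom_context_register_next_node (
+        my_plugin_node_index);
+    }
+
+    /* called per buffer before sending it towards reassembly: tell
+     * reassembly where to send the buffer afterwards, both on success
+     * and on error */
+    static void
+    my_plugin_steer (vlib_buffer_t * b, u32 next_index, u32 error_next_index)
+    {
+      vnet_buffer (b)->ip.reass.next_index = next_index;
+      vnet_buffer (b)->ip.reass.error_next_index = error_next_index;
+    }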
+
+Full reassembly
+---------------
+
+Configuration
+^^^^^^^^^^^^^
+
+Configuration is via API (``ip_reassembly_enable_disable``) or CLI:
+
+``set interface reassembly <interface-name> [on|off|ip4|ip6]``
+
+where ``on`` means both ip4 and ip6.
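+
+For example, to enable both on an interface (the interface name is
+illustrative)::
+
+    set interface reassembly GigabitEthernet0/8/0 on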
+
+A show command is provided to see reassembly contexts:
+
+For ip4:
+
+``show ip4-full-reassembly [details]``
+
+For ip6:
+
+``show ip6-full-reassembly [details]``
+
+Global full reassembly parameters can be modified using API
+``ip_reassembly_set`` and retrieved using ``ip_reassembly_get``.
+
+Defaults
+""""""""
+
+For default values, see the #defines in
+
+`ip4_full_reass.c <__REPOSITORY_URL__/src/vnet/ip/reass/ip4_full_reass.c>`_
+
+========================================= ==========================================
+#define description
+----------------------------------------- ------------------------------------------
+IP4_REASS_TIMEOUT_DEFAULT_MS timeout in milliseconds
+IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS interval between reaping expired sessions
+IP4_REASS_MAX_REASSEMBLIES_DEFAULT maximum number of concurrent reassemblies
+IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT maximum number of fragments per reassembly
+========================================= ==========================================
+
+and
+
+`ip6_full_reass.c <__REPOSITORY_URL__/src/vnet/ip/reass/ip6_full_reass.c>`_
+
+========================================= ==========================================
+#define description
+----------------------------------------- ------------------------------------------
+IP6_REASS_TIMEOUT_DEFAULT_MS timeout in milliseconds
+IP6_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS interval between reaping expired sessions
+IP6_REASS_MAX_REASSEMBLIES_DEFAULT maximum number of concurrent reassemblies
+IP6_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT maximum number of fragments per reassembly
+========================================= ==========================================
+
+Finished/expired contexts
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Reassembly contexts are freed either when reassembly is finished - when
+all data has been received - or in case of a timeout. There is a
+process walking all reassemblies and freeing any expired ones, roughly
+as sketched below.
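+
+A simplified sketch of that expiry walk (modeled on the ip6
+shallow-reassembly walk above; the ``example_*`` names are
+placeholders for the per-protocol types and free function):
+
+.. code-block:: c
+
+    static void
+    example_walk_expired (vlib_main_t * vm, example_per_thread_t * rt,
+                          f64 now, f64 timeout)
+    {
+      u32 *to_free = NULL;
+      u32 index;
+      u32 *i;
+
+      /* collect indices of expired contexts under the per-thread lock */
+      clib_spinlock_lock (&rt->lock);
+      pool_foreach_index (index, rt->pool)
+        {
+          example_reass_t *reass = pool_elt_at_index (rt->pool, index);
+          if (now > reass->last_heard + timeout)
+            vec_add1 (to_free, index);
+        }
+
+      /* free them while still holding the lock */
+      vec_foreach (i, to_free)
+        example_reass_free (vm, rt, pool_elt_at_index (rt->pool, i[0]));
+      clib_spinlock_unlock (&rt->lock);
+
+      vec_free (to_free);
+    }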
+
+Shallow (virtual) reassembly
+----------------------------
+
+Configuration
+^^^^^^^^^^^^^
+
+Configuration is via API (``ip_reassembly_enable_disable``) only, as
+there is no value in turning SVR on by hand without a feature consuming
+the buffer metadata. SVR is designed to be turned on programmatically
+by a feature that requires it.
+
+A show command is provided to see reassembly contexts:
+
+For ip4:
+
+``show ip4-sv-reassembly [details]``
+
+For ip6:
+
+``show ip6-sv-reassembly [details]``
+
+Global shallow reassembly parameters can be modified using API
+``ip_reassembly_set`` and retrieved using ``ip_reassembly_get``.
+
+Defaults
+""""""""
+
+For default values, see the #defines in
+
+`ip4_sv_reass.c <__REPOSITORY_URL__/src/vnet/ip/reass/ip4_sv_reass.c>`_
+
+============================================ ==========================================
+#define description
+-------------------------------------------- ------------------------------------------
+IP4_SV_REASS_TIMEOUT_DEFAULT_MS timeout in milliseconds
+IP4_SV_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS interval between reaping expired sessions
+IP4_SV_REASS_MAX_REASSEMBLIES_DEFAULT maximum number of concurrent reassemblies
+IP4_SV_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT maximum number of fragments per reassembly
+============================================ ==========================================
+
+and
+
+`ip6_sv_reass.c <__REPOSITORY_URL__/src/vnet/ip/reass/ip6_sv_reass.c>`_
+
+============================================ ==========================================
+#define description
+-------------------------------------------- ------------------------------------------
+IP6_SV_REASS_TIMEOUT_DEFAULT_MS timeout in milliseconds
+IP6_SV_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS interval between reaping expired sessions
+IP6_SV_REASS_MAX_REASSEMBLIES_DEFAULT maximum number of concurrent reassemblies
+IP6_SV_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT maximum number of fragments per reassembly
+============================================ ==========================================
+
+Expiring contexts
+^^^^^^^^^^^^^^^^^
+
+There is no way of knowing when a reassembly is finished without
+performing (an almost) full reassembly, so contexts in SVR cannot be
+freed in the same way as in full reassembly. Instead, a different
+approach is taken. A least recently used (LRU) list is maintained, in
+which reassembly contexts are ordered by their last update. The oldest
+context is then freed whenever SVR hits the limit on the number of
+concurrent reassembly contexts. There is also a process reaping expired
+sessions, similar to full reassembly. The LRU handling is sketched
+below.
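+
+The following is only an illustration of the LRU idea, not the actual
+implementation; all ``example_*`` names are placeholders.
+
+.. code-block:: c
+
+    /* Illustrative only: evict the least recently used context when the
+     * configured limit on concurrent contexts is reached, then allocate
+     * a fresh context and mark it as most recently used. */
+    static example_reass_t *
+    example_reass_get (example_per_thread_t * rt, u32 max_reass)
+    {
+      if (rt->reass_n >= max_reass)
+        {
+          example_reass_t *oldest = example_lru_tail (rt);
+          example_reass_free (rt, oldest);
+        }
+
+      example_reass_t *reass = example_reass_alloc (rt);
+      example_lru_move_to_head (rt, reass);
+      return reass;
+    }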
+
+Truncated packets
+^^^^^^^^^^^^^^^^^
+
+When SVR detects that a packet has been truncated in a way that leaves
+the L4 headers unavailable, it marks it as such in vnet_buffer,
+allowing downstream features to handle such packets as they see fit.
+
+Fast path/slow path
+^^^^^^^^^^^^^^^^^^^
+
+SVR is implemented in a fast path/slow path manner. By default, it
+assumes that passing traffic doesn't contain fragments and processes
+buffers in a dual-loop. If it sees a fragment, it jumps to single-loop
+processing, roughly as sketched below.
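+
+Conceptually (heavily simplified and not the actual node code;
+``is_fragment``, ``fast_path`` and ``slow_path`` are placeholders):
+
+.. code-block:: c
+
+    static void
+    example_sv_node_sketch (vlib_buffer_t ** b, u32 n_left_from)
+    {
+      /* fast dual-loop: valid only while no fragments are seen */
+      while (n_left_from >= 2)
+        {
+          if (PREDICT_FALSE (is_fragment (b[0]) || is_fragment (b[1])))
+            break;
+          fast_path (b[0]);
+          fast_path (b[1]);
+          b += 2;
+          n_left_from -= 2;
+        }
+
+      /* slow single-loop: full SVR handling, fragments included */
+      while (n_left_from > 0)
+        {
+          slow_path (b[0]);
+          b += 1;
+          n_left_from -= 1;
+        }
+    }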
+
+Feature enabled by other features/reference counting
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The SVR feature is enabled by other features, such as NAT, when those
+features themselves are enabled. For this to work, SVR implements a
+reference-counted API for enabling/disabling itself, e.g. as sketched
+below.
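+
+A consuming feature would typically just forward its own
+enable/disable calls, e.g. (sketch based on the refcounted API
+declared in ip6_sv_reass.h; error handling omitted):
+
+.. code-block:: c
+
+    #include <vnet/ip/reass/ip6_sv_reass.h>
+
+    /* Sketch: bump the SVR reference count when this feature is
+     * enabled on an interface and drop it when disabled.  SVR is only
+     * actually disabled once the last user releases it. */
+    static int
+    my_feature_enable_disable (u32 sw_if_index, int is_enable)
+    {
+      return ip6_sv_reass_enable_disable_with_refcnt (sw_if_index,
+                                                      is_enable);
+    }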