summaryrefslogtreecommitdiffstats
path: root/src/plugins/lacp/ptx_machine.h
diff options
context:
space:
mode:
authorSteven <sluong@cisco.com>2017-12-20 12:43:01 -0800
committerDamjan Marion <dmarion.lists@gmail.com>2018-03-21 21:02:15 +0000
commit9cd2d7a5a4fafadb65d772c48109d55d1e19d425 (patch)
tree4a9e0665be0096ee6bfc2235388f90b276b23814 /src/plugins/lacp/ptx_machine.h
parent43ebe29b6ea1107c30311cfb3dbd8190282903d0 (diff)
bond: Add bonding driver and LACP protocol
Add bonding driver to support creation of bond interface which composes of multiple slave interfaces. The slave interfaces could be physical interfaces, or just any virtual interfaces. For example, memif interfaces. The syntax to create a bond interface is create bond mode <lacp | xor | acitve-backup | broadcast | round-robin> To enslave an interface to the bond interface, enslave interface TenGigabitEthernet6/0/0 to BondEthernet0 Please see src/plugins/lacp/lacp_doc.md for more examples and additional options. LACP is a control plane protocol which manages and monitors the status of the slave interfaces. The protocol is part of 802.3ad standard. This patch implements LACPv1. LACPv2 is not supported. To enable LACP on the bond interface, specify "mode lacp" when the bond interface is created. The syntax to enslave a slave interface is the same as other bonding modes. Change-Id: I06581d3b87635972f9f0e1ec50b67560fc13e26c Signed-off-by: Steven <sluong@cisco.com>
Diffstat (limited to 'src/plugins/lacp/ptx_machine.h')
-rw-r--r--src/plugins/lacp/ptx_machine.h80
1 files changed, 80 insertions, 0 deletions
diff --git a/src/plugins/lacp/ptx_machine.h b/src/plugins/lacp/ptx_machine.h
new file mode 100644
index 00000000000..a9af4bb89d3
--- /dev/null
+++ b/src/plugins/lacp/ptx_machine.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ___LACP_PTX_MACHINE_H__
+#define ___LACP_PTX_MACHINE_H__
+
+#include <stdint.h>
+#include <lacp/machine.h>
+
+#define foreach_lacp_ptx_event \
+ _(0, BEGIN, "begin") \
+ _(1, LONG_TIMEOUT, "long tiemout") \
+ _(2, TIMER_EXPIRED, "timer expired") \
+ _(3, SHORT_TIMEOUT, "short timeout")
+
+typedef enum
+{
+#define _(a, b, c) LACP_PTX_EVENT_##b = (a),
+ foreach_lacp_ptx_event
+#undef _
+} lacp_ptx_event_t;
+
+#define foreach_lacp_ptx_sm_state \
+ _(0, NO_PERIODIC, "no periodic") \
+ _(1, FAST_PERIODIC, "fast periodic") \
+ _(2, SLOW_PERIODIC, "slow periodic") \
+ _(3, PERIODIC_TX, "periodic transmission")
+
+typedef enum
+{
+#define _(a, b, c) LACP_PTX_STATE_##b = (a),
+ foreach_lacp_ptx_sm_state
+#undef _
+} lacp_ptx_sm_state_t;
+
+extern lacp_machine_t lacp_ptx_machine;
+
+int lacp_ptx_action_no_periodic (void *p1, void *p2);
+int lacp_ptx_action_slow_periodic (void *p1, void *p2);
+int lacp_ptx_action_fast_periodic (void *p1, void *p2);
+int lacp_ptx_action_timer_expired (void *p1, void *p2);
+void lacp_ptx_debug_func (slave_if_t * sif, int event, int state,
+ lacp_fsm_state_t * transition);
+
+#define LACP_ACTION_NO_PERIODIC \
+ LACP_ACTION_ROUTINE(lacp_ptx_action_no_periodic)
+#define LACP_ACTION_SLOW_PERIODIC \
+ LACP_ACTION_ROUTINE(lacp_ptx_action_slow_periodic)
+#define LACP_ACTION_FAST_PERIODIC \
+ LACP_ACTION_ROUTINE(lacp_ptx_action_fast_periodic)
+#define LACP_ACTION_TIMER_EXPIRED \
+ LACP_ACTION_ROUTINE(lacp_ptx_action_timer_expired)
+
+static inline void
+lacp_start_periodic_timer (vlib_main_t * vm, slave_if_t * sif, u8 expiration)
+{
+ sif->periodic_timer = vlib_time_now (vm) + expiration;
+}
+
+#endif /* __LACP_PTX_MACHINE_H__ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
highlight .bp { color: #003388 } /* Name.Builtin.Pseudo */ .highlight .fm { color: #0066bb; font-weight: bold } /* Name.Function.Magic */ .highlight .vc { color: #336699 } /* Name.Variable.Class */ .highlight .vg { color: #dd7700 } /* Name.Variable.Global */ .highlight .vi { color: #3333bb } /* Name.Variable.Instance */ .highlight .vm { color: #336699 } /* Name.Variable.Magic */ .highlight .il { color: #0000DD; font-weight: bold } /* Literal.Number.Integer.Long */ }
# VPP MAP and Lw4o6 implementation    {#map_doc}

This is a memo intended to contain documentation of the VPP MAP and Lw4o6 implementations.
Everything that is not directly obvious should come here.



## MAP-E Virtual Reassembly

The MAP-E implementation supports handling of IPv4 fragments as well as IPv4-in-IPv6 inner and outer fragments. This is called virtual reassembly because the fragments are not actually reassembled. Instead, some meta-data are kept about the first fragment and reused for subsequent fragments.

Fragment caching and handling is not always necessary. It is performed when:
* An IPv4 fragment is received and the destination IPv4 address is shared.
* An IPv6 packet is received with an inner IPv4 fragment, the IPv4 source address is shared, and 'security-check fragments' is on.
* An IPv6 fragment is received.

There are 3 dedicated nodes:
* ip4-map-reass 
* ip6-map-ip4-reass
* ip6-map-ip6-reass

ip4-map sends all fragments to ip4-map-reass.
ip6-map sends all inner-fragments to ip6-map-ip4-reass.
ip6-map sends all outer-fragments to ip6-map-ip6-reass.

IPv4 (resp. IPv6) virtual reassembly makes use of a hash table in order to store IPv4 (resp. IPv6) reassembly structures. The hash-key is based on the IPv4-src:IPv4-dst:Frag-ID:Protocol tuple (resp. IPv6-src:IPv6-dst:Frag-ID tuple, as the protocol is IPv4-in-IPv6). Therefore, each packet reassembly makes use of exactly one reassembly structure. When such a structure is allocated, it is timestamped with the current time. Finally, those structures are capable of storing a limited number of buffer indexes.

An IPv4 (resp. IPv6) reassembly structure can cache up to MAP_IP4_REASS_MAX_FRAGMENTS_PER_REASSEMBLY (resp. MAP_IP6_REASS_MAX_FRAGMENTS_PER_REASSEMBLY) buffers. Buffers are cached until the first fragment is received.

#### Virtual Reassembly configuration

IPv4 and IPv6 virtual reassembly support the following configuration:
    map params reassembly [ip4 | ip6] [lifetime <lifetime-ms>] [pool-size <pool-size>] [buffers <buffers>] [ht-ratio <ht-ratio>]

lifetime: 
	The time in milliseconds a reassembly structure is considered valid. The longer, the more reliable is reassembly, but the more likely it is to exhaust the pool of reassembly structures. IPv4 standard suggests a lifetime of 15 seconds. IPv6 specifies a lifetime of 60 people. Those values are not realistic for high-throughput cases.

buffers:
	The upper limit of buffers that are allowed to be cached. It can be used to protect against fragmentation attacks which would aim to exhaust the global buffers pool.
	
pool-size:
	The number of reassembly structures that can be allocated. As each structure can store a small fixed number of fragments, it also sets an upper-bound of 'pool-size * MAP_IPX_REASS_MAX_FRAGMENTS_PER_REASSEMBLY' buffers that can be cached in total.
	
ht-ratio:
	The amount of buckets in the hash-table is pool-size * ht-ratio.


Any time pool-size and ht-ratio is modified, the hash-table is destroyed and created again, which means all current state is lost.


##### Additional considerations

Reassembly at high rate is expensive in terms of buffers. There is a trade-off between the lifetime and number of allocated buffers. Reducing the lifetime helps, but at the cost of loosing state for fragments that are wide appart.

Let:
R be the packet rate at which fragments are received.
F be the number of fragments per packet.

Assuming the first fragment is always received last. We should have:
buffers > lifetime * R / F * (F - 1)
pool-size > lifetime * R/F

This is a worst case. Receiving the first fragment earlier helps reducing the number of required buffers. Also, an optimization is implemented (MAP_IP6_REASS_COUNT_BYTES and MAP_IP4_REASS_COUNT_BYTES) which counts the number of transmitted bytes and remembers the total number of bytes which should be transmitted based on the last fragment, and therefore helps reducing 'pool-size'.

But the formula shows that it is challenging to forward a significant amount of fragmented packets at high rates. For instance, with a lifetime of 1 second, 5Mpps packet rate would require buffering up to 2.5 millions fragments.

If you want to do that, be prepared to configure a lot of fragments.