summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHongjun Ni <hongjun.ni@intel.com>2018-01-23 19:17:23 +0800
committerHongjun Ni <hongjun.ni@intel.com>2018-02-02 02:20:46 +0000
commit647f609a11e2afb91a5216ca99d0705a3e1212a7 (patch)
tree149ab0443d42a8d8a7620c8bce917ec87edf1ae7
parent91389ac2c28ae10f2b7f766e4dfe7a7fd96dc5e0 (diff)
Add L3DSR feature in LB plugin
L3DSR is used to overcome the Layer 2 limitations of Direct Server Return load balancing. It maps the VIP to DSCP bits and reuses the TOS bits to carry them to the server, which then recovers the VIP from its DSCP-to-VIP mapping. Please refer to https://www.nanog.org/meetings/nanog51/presentations/Monday/NANOG51.Talk45.nanog51-Schaumann.pdf Change-Id: I403ffeadfb04ed0265086eb2dc41f2e17f8f34cb Signed-off-by: Hongjun Ni <hongjun.ni@intel.com>
-rw-r--r--src/plugins/lb/api.c21
-rw-r--r--src/plugins/lb/cli.c42
-rw-r--r--src/plugins/lb/lb.api14
-rw-r--r--src/plugins/lb/lb.c60
-rw-r--r--src/plugins/lb/lb.h38
-rw-r--r--src/plugins/lb/lb_plugin_doc.md33
-rw-r--r--src/plugins/lb/lb_test.c8
-rw-r--r--src/plugins/lb/node.c118
-rw-r--r--test/test_lb.py44
9 files changed, 282 insertions, 96 deletions
diff --git a/src/plugins/lb/api.c b/src/plugins/lb/api.c
index 7eb49ff6c26..28af6daa421 100644
--- a/src/plugins/lb/api.c
+++ b/src/plugins/lb/api.c
@@ -116,14 +116,23 @@ vl_api_lb_add_del_vip_t_handler
rv = lb_vip_del(vip_index);
} else {
u32 vip_index;
- lb_vip_type_t type;
+ lb_vip_type_t type = 0;
+
if (ip46_prefix_is_ip4(&prefix, mp->prefix_length)) {
- type = mp->is_gre4?LB_VIP_TYPE_IP4_GRE4:LB_VIP_TYPE_IP4_GRE6;
+ if (mp->encap == LB_ENCAP_TYPE_GRE4)
+ type = LB_VIP_TYPE_IP4_GRE4;
+ else if (mp->encap == LB_ENCAP_TYPE_GRE6)
+ type = LB_VIP_TYPE_IP4_GRE6;
+ else if (mp->encap == LB_ENCAP_TYPE_L3DSR)
+ type = LB_VIP_TYPE_IP4_L3DSR;
} else {
- type = mp->is_gre4?LB_VIP_TYPE_IP6_GRE4:LB_VIP_TYPE_IP6_GRE6;
+ if (mp->encap == LB_ENCAP_TYPE_GRE4)
+ type = LB_VIP_TYPE_IP6_GRE4;
+ else if (mp->encap == LB_ENCAP_TYPE_GRE6)
+ type = LB_VIP_TYPE_IP6_GRE6;
}
- rv = lb_vip_add(&prefix, mp->prefix_length, type,
+ rv = lb_vip_add(&prefix, mp->prefix_length, type, mp->dscp,
mp->new_flows_table_length, &vip_index);
}
REPLY_MACRO (VL_API_LB_CONF_REPLY);
@@ -136,7 +145,9 @@ static void *vl_api_lb_add_del_vip_t_print
s = format (0, "SCRIPT: lb_add_del_vip ");
s = format (s, "%U ", format_ip46_prefix,
(ip46_address_t *)mp->ip_prefix, mp->prefix_length, IP46_TYPE_ANY);
- s = format (s, "%s ", mp->is_gre4?"gre4":"gre6");
+
+ s = format (s, "%s ", (mp->encap==LB_ENCAP_TYPE_GRE4)?
+ "gre4":(mp->encap==LB_ENCAP_TYPE_GRE6)?"gre6":"l3dsr");
s = format (s, "%u ", mp->new_flows_table_length);
s = format (s, "%s ", mp->is_del?"del":"add");
FINISH;
diff --git a/src/plugins/lb/cli.c b/src/plugins/lb/cli.c
index a5a87fccc5f..b29605af984 100644
--- a/src/plugins/lb/cli.c
+++ b/src/plugins/lb/cli.c
@@ -26,8 +26,9 @@ lb_vip_command_fn (vlib_main_t * vm,
u32 new_len = 1024;
u8 del = 0;
int ret;
- u32 gre4 = 0;
- lb_vip_type_t type;
+ u32 encap = 0;
+ u32 dscp = ~0;
+ lb_vip_type_t type = 0;
clib_error_t *error = 0;
if (!unformat_user (input, unformat_line_input, line_input))
@@ -46,9 +47,13 @@ lb_vip_command_fn (vlib_main_t * vm,
else if (unformat(line_input, "del"))
del = 1;
else if (unformat(line_input, "encap gre4"))
- gre4 = 1;
+ encap = LB_ENCAP_TYPE_GRE4;
else if (unformat(line_input, "encap gre6"))
- gre4 = 0;
+ encap = LB_ENCAP_TYPE_GRE6;
+ else if (unformat(line_input, "encap l3dsr"))
+ encap = LB_ENCAP_TYPE_L3DSR;
+ else if (unformat(line_input, "dscp %d", &dscp))
+ ;
else {
error = clib_error_return (0, "parse error: '%U'",
format_unformat_error, line_input);
@@ -56,18 +61,39 @@ lb_vip_command_fn (vlib_main_t * vm,
}
}
+ if ((encap != LB_ENCAP_TYPE_L3DSR) && (dscp != ~0) )
+ {
+ error = clib_error_return (0, "lb_vip_add error: "
+ "should not configure dscp for none L3DSR.");
+ goto done;
+ }
+
+ if ((encap == LB_ENCAP_TYPE_L3DSR) && (dscp >= 64 ) )
+ {
+ error = clib_error_return (0, "lb_vip_add error: "
+ "dscp for L3DSR should be less than 64.");
+ goto done;
+ }
if (ip46_prefix_is_ip4(&prefix, plen)) {
- type = (gre4)?LB_VIP_TYPE_IP4_GRE4:LB_VIP_TYPE_IP4_GRE6;
+ if (encap == LB_ENCAP_TYPE_GRE4)
+ type = LB_VIP_TYPE_IP4_GRE4;
+ else if (encap == LB_ENCAP_TYPE_GRE6)
+ type = LB_VIP_TYPE_IP4_GRE6;
+ else if (encap == LB_ENCAP_TYPE_L3DSR)
+ type = LB_VIP_TYPE_IP4_L3DSR;
} else {
- type = (gre4)?LB_VIP_TYPE_IP6_GRE4:LB_VIP_TYPE_IP6_GRE6;
+ if (encap == LB_ENCAP_TYPE_GRE4)
+ type = LB_VIP_TYPE_IP6_GRE4;
+ else if (encap == LB_ENCAP_TYPE_GRE6)
+ type = LB_VIP_TYPE_IP6_GRE6;
}
lb_garbage_collection();
u32 index;
if (!del) {
- if ((ret = lb_vip_add(&prefix, plen, type, new_len, &index))) {
+ if ((ret = lb_vip_add(&prefix, plen, type, (u8)(dscp & 0x3F), new_len, &index))) {
error = clib_error_return (0, "lb_vip_add error %d", ret);
goto done;
} else {
@@ -92,7 +118,7 @@ done:
VLIB_CLI_COMMAND (lb_vip_command, static) =
{
.path = "lb vip",
- .short_help = "lb vip <prefix> [encap (gre6|gre4)] [new_len <n>] [del]",
+ .short_help = "lb vip <prefix> [encap (gre6|gre4|l3dsr)] [dscp <n>] [new_len <n>] [del]",
.function = lb_vip_command_fn,
};
diff --git a/src/plugins/lb/lb.api b/src/plugins/lb/lb.api
index f5036edf79d..101cee88ded 100644
--- a/src/plugins/lb/lb.api
+++ b/src/plugins/lb/lb.api
@@ -23,9 +23,10 @@ autoreply define lb_conf
/** \brief Add a virtual address (or prefix)
@param client_index - opaque cookie to identify the sender
@param context - sender context, to match reply w/ request
- @param ip_prefix - IP address (IPv4 in lower order 32 bits).
- @param prefix_length - IP prefix length (96 + 'IPv4 prefix length' for IPv4).
- @param is_gre4 - Encap is ip4 GRE (ip6 GRE otherwise).
+ @param ip_prefix - IP address (IPv4 in lower order 32 bits).
+ @param prefix_length - IP prefix length (96 + 'IPv4 prefix length' for IPv4).
+ @param encap - Encap is ip4 GRE(0) or ip6 GRE(1) or L3DSR(2).
+ @param dscp - DSCP value (0-63) mapped to the VIP (applicable in L3DSR mode only).
@param new_flows_table_length - Size of the new connections flow table used
for this VIP (must be power of 2).
@param is_del - The VIP should be removed.
@@ -35,7 +36,8 @@ autoreply define lb_add_del_vip {
u32 context;
u8 ip_prefix[16];
u8 prefix_length;
- u8 is_gre4;
+ u8 encap;
+ u8 dscp;
u32 new_flows_table_length;
u8 is_del;
};
@@ -43,8 +45,8 @@ autoreply define lb_add_del_vip {
/** \brief Add an application server for a given VIP
@param client_index - opaque cookie to identify the sender
@param context - sender context, to match reply w/ request
- @param vip_ip_prefix - VIP IP address (IPv4 in lower order 32 bits).
- @param vip_ip_prefix - VIP IP prefix length (96 + 'IPv4 prefix length' for IPv4).
+ @param vip_ip_prefix - VIP IP address (IPv4 in lower order 32 bits).
+ @param vip_ip_prefix - VIP IP prefix length (96 + 'IPv4 prefix length' for IPv4).
@param as_address - The application server address (IPv4 in lower order 32 bits).
@param is_del - The AS should be removed.
*/
diff --git a/src/plugins/lb/lb.c b/src/plugins/lb/lb.c
index fee88056eb4..06953a45aaa 100644
--- a/src/plugins/lb/lb.c
+++ b/src/plugins/lb/lb.c
@@ -48,6 +48,12 @@ const static char* const * const lb_dpo_gre6_nodes[DPO_PROTO_NUM] =
[DPO_PROTO_IP6] = lb_dpo_gre6_ip6,
};
+const static char * const lb_dpo_l3dsr_ip4[] = { "lb4-l3dsr" , NULL };
+const static char* const * const lb_dpo_l3dsr_nodes[DPO_PROTO_NUM] =
+ {
+ [DPO_PROTO_IP4] = lb_dpo_l3dsr_ip4,
+ };
+
u32 lb_hash_time_now(vlib_main_t * vm)
{
return (u32) (vlib_time_now(vm) + 10000);
@@ -81,6 +87,7 @@ static char *lb_vip_type_strings[] = {
[LB_VIP_TYPE_IP6_GRE4] = "ip6-gre4",
[LB_VIP_TYPE_IP4_GRE6] = "ip4-gre6",
[LB_VIP_TYPE_IP4_GRE4] = "ip4-gre4",
+ [LB_VIP_TYPE_IP4_L3DSR] = "ip4-l3dsr",
};
u8 *format_lb_vip_type (u8 * s, va_list * args)
@@ -140,6 +147,13 @@ u8 *format_lb_vip_detailed (u8 * s, va_list * args)
format_white_space, indent,
vip->new_flow_table_mask + 1);
+ if (vip->type == LB_VIP_TYPE_IP4_L3DSR)
+ {
+ s = format(s, "%U dscp:%u\n",
+ format_white_space, indent,
+ vip->dscp);
+ }
+
//Print counters
s = format(s, "%U counters:\n",
format_white_space, indent);
@@ -434,7 +448,7 @@ int lb_vip_add_ass(u32 vip_index, ip46_address_t *addresses, u32 n)
return VNET_API_ERROR_NO_SUCH_ENTRY;
}
- ip46_type_t type = lb_vip_is_gre4(vip)?IP46_TYPE_IP4:IP46_TYPE_IP6;
+ ip46_type_t type = lb_encap_is_ip4(vip)?IP46_TYPE_IP4:IP46_TYPE_IP6;
u32 *to_be_added = 0;
u32 *to_be_updated = 0;
u32 i;
@@ -497,7 +511,7 @@ next:
* so we are informed when its forwarding changes
*/
fib_prefix_t nh = {};
- if (lb_vip_is_gre4(vip)) {
+ if (lb_encap_is_ip4(vip)) {
nh.fp_addr.ip4 = as->address.ip4;
nh.fp_len = 32;
nh.fp_proto = FIB_PROTOCOL_IP4;
@@ -595,6 +609,8 @@ int lb_vip_del_ass(u32 vip_index, ip46_address_t *addresses, u32 n)
static void lb_vip_add_adjacency(lb_main_t *lbm, lb_vip_t *vip)
{
dpo_proto_t proto = 0;
+ dpo_type_t dpo_type = 0;
+
dpo_id_t dpo = DPO_INVALID;
fib_prefix_t pfx = {};
if (lb_vip_is_ip4(vip)) {
@@ -608,8 +624,15 @@ static void lb_vip_add_adjacency(lb_main_t *lbm, lb_vip_t *vip)
pfx.fp_proto = FIB_PROTOCOL_IP6;
proto = DPO_PROTO_IP6;
}
- dpo_set(&dpo, lb_vip_is_gre4(vip)?lbm->dpo_gre4_type:lbm->dpo_gre6_type,
- proto, vip - lbm->vips);
+
+ if(lb_vip_is_gre4(vip))
+ dpo_type = lbm->dpo_gre4_type;
+ else if (lb_vip_is_gre6(vip))
+ dpo_type = lbm->dpo_gre6_type;
+ else if (lb_vip_is_l3dsr(vip))
+ dpo_type = lbm->dpo_l3dsr_type;
+
+ dpo_set(&dpo, dpo_type, proto, vip - lbm->vips);
fib_table_entry_special_dpo_add(0,
&pfx,
FIB_SOURCE_PLUGIN_HI,
@@ -636,10 +659,12 @@ static void lb_vip_del_adjacency(lb_main_t *lbm, lb_vip_t *vip)
fib_table_entry_special_remove(0, &pfx, FIB_SOURCE_PLUGIN_HI);
}
-int lb_vip_add(ip46_address_t *prefix, u8 plen, lb_vip_type_t type, u32 new_length, u32 *vip_index)
+int lb_vip_add(ip46_address_t *prefix, u8 plen, lb_vip_type_t type, u8 dscp,
+ u32 new_length, u32 *vip_index)
{
lb_main_t *lbm = &lb_main;
lb_vip_t *vip;
+
lb_get_writer_lock();
ip46_prefix_normalize(prefix, plen);
@@ -655,9 +680,19 @@ int lb_vip_add(ip46_address_t *prefix, u8 plen, lb_vip_type_t type, u32 new_leng
if (ip46_prefix_is_ip4(prefix, plen) &&
(type != LB_VIP_TYPE_IP4_GRE4) &&
- (type != LB_VIP_TYPE_IP4_GRE6))
+ (type != LB_VIP_TYPE_IP4_GRE6) &&
+ (type != LB_VIP_TYPE_IP4_L3DSR))
+ return VNET_API_ERROR_INVALID_ADDRESS_FAMILY;
+
+ if ((!ip46_prefix_is_ip4(prefix, plen)) &&
+ (type != LB_VIP_TYPE_IP6_GRE4) &&
+ (type != LB_VIP_TYPE_IP6_GRE6))
return VNET_API_ERROR_INVALID_ADDRESS_FAMILY;
+ if ((type == LB_VIP_TYPE_IP4_L3DSR) && (dscp >= 64 ) )
+ {
+ return VNET_API_ERROR_VALUE_EXIST;
+ }
//Allocate
pool_get(lbm->vips, vip);
@@ -667,6 +702,7 @@ int lb_vip_add(ip46_address_t *prefix, u8 plen, lb_vip_type_t type, u32 new_leng
vip->plen = plen;
vip->last_garbage_collection = (u32) vlib_time_now(vlib_get_main());
vip->type = type;
+ vip->dscp = dscp;
vip->flags = LB_VIP_FLAGS_USED;
vip->as_indexes = 0;
@@ -775,7 +811,16 @@ lb_as_stack (lb_as_t *as)
{
lb_main_t *lbm = &lb_main;
lb_vip_t *vip = &lbm->vips[as->vip_index];
- dpo_stack(lb_vip_is_gre4(vip)?lbm->dpo_gre4_type:lbm->dpo_gre6_type,
+ dpo_type_t dpo_type = 0;
+
+ if(lb_vip_is_gre4(vip))
+ dpo_type = lbm->dpo_gre4_type;
+ else if (lb_vip_is_gre6(vip))
+ dpo_type = lbm->dpo_gre6_type;
+ else if (lb_vip_is_l3dsr(vip))
+ dpo_type = lbm->dpo_l3dsr_type;
+
+ dpo_stack(dpo_type,
lb_vip_is_ip4(vip)?DPO_PROTO_IP4:DPO_PROTO_IP6,
&as->dpo,
fib_entry_contribute_ip_forwarding(
@@ -819,6 +864,7 @@ lb_init (vlib_main_t * vm)
lbm->ip6_src_address.as_u64[1] = 0xffffffffffffffffL;
lbm->dpo_gre4_type = dpo_register_new_type(&lb_vft, lb_dpo_gre4_nodes);
lbm->dpo_gre6_type = dpo_register_new_type(&lb_vft, lb_dpo_gre6_nodes);
+ lbm->dpo_l3dsr_type = dpo_register_new_type(&lb_vft, lb_dpo_l3dsr_nodes);
lbm->fib_node_type = fib_node_register_new_type(&lb_fib_node_vft);
//Init AS reference counters
diff --git a/src/plugins/lb/lb.h b/src/plugins/lb/lb.h
index fa0b5d48b07..8db0394075c 100644
--- a/src/plugins/lb/lb.h
+++ b/src/plugins/lb/lb.h
@@ -37,6 +37,7 @@
#include <vnet/ip/ip.h>
#include <vnet/dpo/dpo.h>
#include <vnet/fib/fib_table.h>
+#include <vppinfra/hash.h>
#include <lb/lbhash.h>
@@ -128,18 +129,27 @@ typedef enum {
LB_N_VIP_COUNTERS
} lb_vip_counter_t;
+typedef enum {
+ LB_ENCAP_TYPE_GRE4,
+ LB_ENCAP_TYPE_GRE6,
+ LB_ENCAP_TYPE_L3DSR,
+ LB_ENCAP_N_TYPES,
+} lb_encap_type_t;
+
/**
* The load balancer supports IPv4 and IPv6 traffic
- * and GRE4 and GRE6 encap.
+ * and GRE4, GRE6 and L3DSR encap.
*/
typedef enum {
LB_VIP_TYPE_IP6_GRE6,
LB_VIP_TYPE_IP6_GRE4,
LB_VIP_TYPE_IP4_GRE6,
LB_VIP_TYPE_IP4_GRE4,
+ LB_VIP_TYPE_IP4_L3DSR,
LB_VIP_N_TYPES,
} lb_vip_type_t;
+
format_function_t format_lb_vip_type;
unformat_function_t unformat_lb_vip_type;
@@ -196,6 +206,11 @@ typedef struct {
lb_vip_type_t type;
/**
+ * DSCP bits for L3DSR
+ */
+ u8 dscp;
+
+ /**
* Flags related to this VIP.
* LB_VIP_FLAGS_USED means the VIP is active.
* When it is not set, the VIP in the process of being removed.
@@ -212,8 +227,20 @@ typedef struct {
u32 *as_indexes;
} lb_vip_t;
-#define lb_vip_is_ip4(vip) ((vip)->type == LB_VIP_TYPE_IP4_GRE6 || (vip)->type == LB_VIP_TYPE_IP4_GRE4)
-#define lb_vip_is_gre4(vip) ((vip)->type == LB_VIP_TYPE_IP6_GRE4 || (vip)->type == LB_VIP_TYPE_IP4_GRE4)
+#define lb_vip_is_ip4(vip) ((vip)->type == LB_VIP_TYPE_IP4_GRE6 \
+ || (vip)->type == LB_VIP_TYPE_IP4_GRE4 \
+ || (vip)->type == LB_VIP_TYPE_IP4_L3DSR )
+
+#define lb_vip_is_gre4(vip) ((vip)->type == LB_VIP_TYPE_IP6_GRE4 \
+ || (vip)->type == LB_VIP_TYPE_IP4_GRE4)
+#define lb_vip_is_gre6(vip) ((vip)->type == LB_VIP_TYPE_IP6_GRE6 \
+ || (vip)->type == LB_VIP_TYPE_IP4_GRE6)
+#define lb_vip_is_l3dsr(vip) ((vip)->type == LB_VIP_TYPE_IP4_L3DSR)
+
+#define lb_encap_is_ip4(vip) ((vip)->type == LB_VIP_TYPE_IP6_GRE4 \
+ || (vip)->type == LB_VIP_TYPE_IP4_GRE4 \
+ || (vip)->type == LB_VIP_TYPE_IP4_L3DSR)
+
format_function_t format_lb_vip;
format_function_t format_lb_vip_detailed;
@@ -286,6 +313,7 @@ typedef struct {
*/
dpo_type_t dpo_gre4_type;
dpo_type_t dpo_gre6_type;
+ dpo_type_t dpo_l3dsr_type;
/**
* Node type for registering to fib changes.
@@ -313,8 +341,8 @@ extern vlib_node_registration_t lb4_node;
int lb_conf(ip4_address_t *ip4_address, ip6_address_t *ip6_address,
u32 sticky_buckets, u32 flow_timeout);
-int lb_vip_add(ip46_address_t *prefix, u8 plen, lb_vip_type_t type,
- u32 new_length, u32 *vip_index);
+int lb_vip_add(ip46_address_t *prefix, u8 plen, lb_vip_type_t type, u8 dscp,
+ u32 new_length, u32 *vip_index);
int lb_vip_del(u32 vip_index);
int lb_vip_find_index(ip46_address_t *prefix, u8 plen, u32 *vip_index);
diff --git a/src/plugins/lb/lb_plugin_doc.md b/src/plugins/lb/lb_plugin_doc.md
index c7885ffb837..7672b1e88d7 100644
--- a/src/plugins/lb/lb_plugin_doc.md
+++ b/src/plugins/lb/lb_plugin_doc.md
@@ -8,19 +8,26 @@ Wich also means feedback is really welcome regarding features, apis, etc...
## Overview
-This plugin provides load balancing for VPP in a way that is largely inspired
+This plugin provides load balancing for VPP in a way that is largely inspired
from Google's MagLev: http://research.google.com/pubs/pub44824.html
-The load balancer is configured with a set of Virtual IPs (VIP, which can be
+The load balancer is configured with a set of Virtual IPs (VIP, which can be
prefixes), and for each VIP, with a set of Application Server addresses (ASs).
+There are three encap types to steer traffic to different ASs:
+1). IPv4+GRE and IPv6+GRE encap types:
Traffic received for a given VIP (or VIP prefix) is tunneled using GRE towards
-the different ASs in a way that (tries to) ensure that a given session will
+the different ASs in a way that (tries to) ensure that a given session will
always be tunneled to the same AS.
+2). IPv4+L3DSR encap type:
+L3DSR is used to overcome the Layer 2 limitations of Direct Server Return load balancing.
+It maps the VIP to DSCP bits and reuses the TOS bits to carry the DSCP bits
+to the server; the server then recovers the VIP from its DSCP-to-VIP mapping.
+
Both VIPs or ASs can be IPv4 or IPv6, but for a given VIP, all ASs must be using
-the same encap. type (i.e. IPv4+GRE or IPv6+GRE). Meaning that for a given VIP,
-all AS addresses must be of the same family.
+the same encap. type (i.e. IPv4+GRE or IPv6+GRE or IPv4+L3DSR).
+This means that, for a given VIP, all AS addresses must be of the same family.
## Performances
@@ -35,34 +42,36 @@ in next versions.
The load balancer needs to be configured with some parameters:
- lb conf [ip4-src-address <addr>] [ip6-src-address <addr>]
+ lb conf [ip4-src-address <addr>] [ip6-src-address <addr>]
[buckets <n>] [timeout <s>]
-
+
ip4-src-address: the source address used to send encap. packets using IPv4.
ip6-src-address: the source address used to send encap. packets using IPv6.
buckets: the *per-thread* established-connexions-table number of buckets.
-timeout: the number of seconds a connection will remain in the
+timeout: the number of seconds a connection will remain in the
established-connexions-table while no packet for this flow
is received.
-
### Configure the VIPs
- lb vip <prefix> [encap (gre6|gre4)] [new_len <n>] [del]
-
+ lb vip <prefix> [encap (gre6|gre4|l3dsr)] [dscp <n>] [new_len <n>] [del]
+
new_len is the size of the new-connection-table. It should be 1 or 2 orders of
magnitude bigger than the number of ASs for the VIP in order to ensure a good
load balancing.
+With encap l3dsr, dscp is the DSCP value mapped to the VIP; it is written into the DSCP bits of the forwarded packets.
+The selected server can then recover the VIP from the DSCP bits of the packet and perform DSR.
Examples:
-
+
lb vip 2002::/16 encap gre6 new_len 1024
lb vip 2003::/16 encap gre4 new_len 2048
lb vip 80.0.0.0/8 encap gre6 new_len 16
lb vip 90.0.0.0/8 encap gre4 new_len 1024
+ lb vip 100.0.0.0/8 encap l3dsr dscp 2 new_len 32
### Configure the ASs (for each VIP)
diff --git a/src/plugins/lb/lb_test.c b/src/plugins/lb/lb_test.c
index 35bda262fee..b02793944c5 100644
--- a/src/plugins/lb/lb_test.c
+++ b/src/plugins/lb/lb_test.c
@@ -157,7 +157,7 @@ static int api_lb_add_del_vip (vat_main_t * vam)
vl_api_lb_add_del_vip_t mps, *mp;
int ret;
mps.is_del = 0;
- mps.is_gre4 = 0;
+ mps.encap = LB_ENCAP_TYPE_GRE4;
if (!unformat(i, "%U",
unformat_ip46_prefix, mps.ip_prefix, &mps.prefix_length, IP46_TYPE_ANY)) {
@@ -166,9 +166,11 @@ static int api_lb_add_del_vip (vat_main_t * vam)
}
if (unformat(i, "gre4")) {
- mps.is_gre4 = 1;
+ mps.encap = LB_ENCAP_TYPE_GRE4;
} else if (unformat(i, "gre6")) {
- mps.is_gre4 = 0;
+ mps.encap = LB_ENCAP_TYPE_GRE6;
+ } else if (unformat(i, "l3dsr")) {
+ mps.encap = LB_ENCAP_TYPE_L3DSR;
} else {
errmsg ("no encap\n");
return -99;
diff --git a/src/plugins/lb/node.c b/src/plugins/lb/node.c
index 4a7485eb835..22ba3104f92 100644
--- a/src/plugins/lb/node.c
+++ b/src/plugins/lb/node.c
@@ -149,7 +149,7 @@ static_always_inline uword
lb_node_fn (vlib_main_t * vm,
vlib_node_runtime_t * node, vlib_frame_t * frame,
u8 is_input_v4, //Compile-time parameter stating that is input is v4 (or v6)
- u8 is_encap_v4) //Compile-time parameter stating that is GRE encap is v4 (or v6)
+ lb_encap_type_t encap_type) //Compile-time parameter stating that is GRE4 or GRE6 or L3DSR
{
lb_main_t *lbm = &lb_main;
u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
@@ -265,43 +265,54 @@ lb_node_fn (vlib_main_t * vm,
1);
//Now let's encap
- {
- gre_header_t *gre0;
- if (is_encap_v4)
- {
- ip4_header_t *ip40;
- vlib_buffer_advance(p0, - sizeof(ip4_header_t) - sizeof(gre_header_t));
- ip40 = vlib_buffer_get_current(p0);
- gre0 = (gre_header_t *)(ip40 + 1);
- ip40->src_address = lbm->ip4_src_address;
- ip40->dst_address = lbm->ass[asindex0].address.ip4;
- ip40->ip_version_and_header_length = 0x45;
- ip40->ttl = 128;
- ip40->fragment_id = 0;
- ip40->flags_and_fragment_offset = 0;
- ip40->length = clib_host_to_net_u16(len0 + sizeof(gre_header_t) + sizeof(ip4_header_t));
- ip40->protocol = IP_PROTOCOL_GRE;
- ip40->checksum = ip4_header_checksum (ip40);
- }
- else
- {
- ip6_header_t *ip60;
- vlib_buffer_advance(p0, - sizeof(ip6_header_t) - sizeof(gre_header_t));
- ip60 = vlib_buffer_get_current(p0);
- gre0 = (gre_header_t *)(ip60 + 1);
- ip60->dst_address = lbm->ass[asindex0].address.ip6;
- ip60->src_address = lbm->ip6_src_address;
- ip60->hop_limit = 128;
- ip60->ip_version_traffic_class_and_flow_label = clib_host_to_net_u32 (0x6<<28);
- ip60->payload_length = clib_host_to_net_u16(len0 + sizeof(gre_header_t));
- ip60->protocol = IP_PROTOCOL_GRE;
- }
-
- gre0->flags_and_version = 0;
- gre0->protocol = (is_input_v4)?
- clib_host_to_net_u16(0x0800):
- clib_host_to_net_u16(0x86DD);
- }
+ if ( (encap_type == LB_ENCAP_TYPE_GRE4)
+ || (encap_type == LB_ENCAP_TYPE_GRE6) )
+ {
+ gre_header_t *gre0;
+ if (encap_type == LB_ENCAP_TYPE_GRE4) /* encap GRE4*/
+ {
+ ip4_header_t *ip40;
+ vlib_buffer_advance(p0, - sizeof(ip4_header_t) - sizeof(gre_header_t));
+ ip40 = vlib_buffer_get_current(p0);
+ gre0 = (gre_header_t *)(ip40 + 1);
+ ip40->src_address = lbm->ip4_src_address;
+ ip40->dst_address = lbm->ass[asindex0].address.ip4;
+ ip40->ip_version_and_header_length = 0x45;
+ ip40->ttl = 128;
+ ip40->fragment_id = 0;
+ ip40->flags_and_fragment_offset = 0;
+ ip40->length = clib_host_to_net_u16(len0 + sizeof(gre_header_t) + sizeof(ip4_header_t));
+ ip40->protocol = IP_PROTOCOL_GRE;
+ ip40->checksum = ip4_header_checksum (ip40);
+ }
+ else /* encap GRE6*/
+ {
+ ip6_header_t *ip60;
+ vlib_buffer_advance(p0, - sizeof(ip6_header_t) - sizeof(gre_header_t));
+ ip60 = vlib_buffer_get_current(p0);
+ gre0 = (gre_header_t *)(ip60 + 1);
+ ip60->dst_address = lbm->ass[asindex0].address.ip6;
+ ip60->src_address = lbm->ip6_src_address;
+ ip60->hop_limit = 128;
+ ip60->ip_version_traffic_class_and_flow_label = clib_host_to_net_u32 (0x6<<28);
+ ip60->payload_length = clib_host_to_net_u16(len0 + sizeof(gre_header_t));
+ ip60->protocol = IP_PROTOCOL_GRE;
+ }
+
+ gre0->flags_and_version = 0;
+ gre0->protocol = (is_input_v4)?
+ clib_host_to_net_u16(0x0800):
+ clib_host_to_net_u16(0x86DD);
+ } else if (encap_type == LB_ENCAP_TYPE_L3DSR) /* encap L3DSR*/
+ {
+ ip4_header_t *ip40;
+
+ ip40 = vlib_buffer_get_current(p0);
+ ip40->dst_address = lbm->ass[asindex0].address.ip4;
+ /* Get and rewrite DSCP bit */
+ ip40->tos = (u8)((vip0->dscp & 0x3F)<<2);
+ ip40->checksum = ip4_header_checksum (ip40);
+ }
if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED))
{
@@ -327,28 +338,35 @@ static uword
lb6_gre6_node_fn (vlib_main_t * vm,
vlib_node_runtime_t * node, vlib_frame_t * frame)
{
- return lb_node_fn(vm, node, frame, 0, 0);
+ return lb_node_fn(vm, node, frame, 0, LB_ENCAP_TYPE_GRE6);
}
static uword
lb6_gre4_node_fn (vlib_main_t * vm,
vlib_node_runtime_t * node, vlib_frame_t * frame)
{
- return lb_node_fn(vm, node, frame, 0, 1);
+ return lb_node_fn(vm, node, frame, 0, LB_ENCAP_TYPE_GRE4);
}
static uword
lb4_gre6_node_fn (vlib_main_t * vm,
vlib_node_runtime_t * node, vlib_frame_t * frame)
{
- return lb_node_fn(vm, node, frame, 1, 0);
+ return lb_node_fn(vm, node, frame, 1, LB_ENCAP_TYPE_GRE6);
}
static uword
lb4_gre4_node_fn (vlib_main_t * vm,
vlib_node_runtime_t * node, vlib_frame_t * frame)
{
- return lb_node_fn(vm, node, frame, 1, 1);
+ return lb_node_fn(vm, node, frame, 1, LB_ENCAP_TYPE_GRE4);
+}
+
+static uword
+lb4_l3dsr_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node, vlib_frame_t * frame)
+{
+ return lb_node_fn(vm, node, frame, 1, LB_ENCAP_TYPE_L3DSR);
}
VLIB_REGISTER_NODE (lb6_gre6_node) =
@@ -419,3 +437,19 @@ VLIB_REGISTER_NODE (lb4_gre4_node) =
},
};
+VLIB_REGISTER_NODE (lb4_l3dsr_node) =
+{
+ .function = lb4_l3dsr_node_fn,
+ .name = "lb4-l3dsr",
+ .vector_size = sizeof (u32),
+ .format_trace = format_lb_trace,
+
+ .n_errors = LB_N_ERROR,
+ .error_strings = lb_error_strings,
+
+ .n_next_nodes = LB_N_NEXT,
+ .next_nodes =
+ {
+ [LB_NEXT_DROP] = "error-drop"
+ },
+};
diff --git a/test/test_lb.py b/test/test_lb.py
index e653b60b0ab..731790bce72 100644
--- a/test/test_lb.py
+++ b/test/test_lb.py
@@ -15,6 +15,7 @@ from util import ppp
- IP4 to GRE6 encap
- IP6 to GRE4 encap
- IP6 to GRE6 encap
+ - IP4 to L3DSR encap
As stated in comments below, GRE has issues with IPv6.
All test cases involving IPv6 are executed, but
@@ -94,7 +95,7 @@ class TestLB(VppTestCase):
self.assertEqual(payload_info.src, self.pg0.sw_if_index)
self.assertEqual(str(inner), str(self.info.data[IPver]))
- def checkCapture(self, gre4, isv4):
+ def checkCapture(self, encap, isv4):
self.pg0.assert_nothing_captured()
out = self.pg1.get_capture(len(self.packets))
@@ -104,7 +105,7 @@ class TestLB(VppTestCase):
try:
asid = 0
gre = None
- if gre4:
+ if (encap == 'gre4'):
ip = p[IP]
asid = int(ip.dst.split(".")[3])
self.assertEqual(ip.version, 4)
@@ -115,7 +116,8 @@ class TestLB(VppTestCase):
self.assertEqual(len(ip.options), 0)
self.assertGreaterEqual(ip.ttl, 64)
gre = p[GRE]
- else:
+ self.checkInner(gre, isv4)
+ elif (encap == 'gre6'):
ip = p[IPv6]
asid = ip.dst.split(":")
asid = asid[len(asid) - 1]
@@ -132,7 +134,15 @@ class TestLB(VppTestCase):
self.assertGreaterEqual(ip.hlim, 64)
# self.assertEqual(len(ip.options), 0)
gre = GRE(str(p[IPv6].payload))
- self.checkInner(gre, isv4)
+ self.checkInner(gre, isv4)
+ if (encap == 'l3dsr'):
+ ip = p[IP]
+ asid = int(ip.dst.split(".")[3])
+ self.assertEqual(ip.version, 4)
+ self.assertEqual(ip.flags, 0)
+ self.assertEqual(ip.dst, "10.0.0.%u" % asid)
+ self.assertEqual(ip.tos, 0x1c)
+ self.assertEqual(len(ip.options), 0)
load[asid] += 1
except:
self.logger.error(ppp("Unexpected or invalid packet:", p))
@@ -156,7 +166,7 @@ class TestLB(VppTestCase):
self.pg0.add_stream(self.generatePackets(self.pg0, isv4=True))
self.pg_enable_capture(self.pg_interfaces)
self.pg_start()
- self.checkCapture(gre4=True, isv4=True)
+ self.checkCapture(encap='gre4', isv4=True)
finally:
for asid in self.ass:
@@ -176,7 +186,7 @@ class TestLB(VppTestCase):
self.pg_enable_capture(self.pg_interfaces)
self.pg_start()
- self.checkCapture(gre4=True, isv4=False)
+ self.checkCapture(encap='gre4', isv4=False)
finally:
for asid in self.ass:
self.vapi.cli("lb as 2001::/16 10.0.0.%u del" % (asid))
@@ -194,7 +204,7 @@ class TestLB(VppTestCase):
self.pg_enable_capture(self.pg_interfaces)
self.pg_start()
- self.checkCapture(gre4=False, isv4=True)
+ self.checkCapture(encap='gre6', isv4=True)
finally:
for asid in self.ass:
self.vapi.cli("lb as 90.0.0.0/8 2002::%u del" % (asid))
@@ -212,9 +222,27 @@ class TestLB(VppTestCase):
self.pg_enable_capture(self.pg_interfaces)
self.pg_start()
- self.checkCapture(gre4=False, isv4=False)
+ self.checkCapture(encap='gre6', isv4=False)
finally:
for asid in self.ass:
self.vapi.cli("lb as 2001::/16 2002::%u del" % (asid))
self.vapi.cli("lb vip 2001::/16 encap gre6 del")
self.vapi.cli("test lb flowtable flush")
+
+ def test_lb_ip4_l3dsr(self):
+ """ Load Balancer IP4 L3DSR """
+ try:
+ self.vapi.cli("lb vip 90.0.0.0/8 encap l3dsr dscp 7")
+ for asid in self.ass:
+ self.vapi.cli("lb as 90.0.0.0/8 10.0.0.%u" % (asid))
+
+ self.pg0.add_stream(self.generatePackets(self.pg0, isv4=True))
+ self.pg_enable_capture(self.pg_interfaces)
+ self.pg_start()
+ self.checkCapture(encap='l3dsr', isv4=True)
+
+ finally:
+ for asid in self.ass:
+ self.vapi.cli("lb as 90.0.0.0/8 10.0.0.%u del" % (asid))
+ self.vapi.cli("lb vip 90.0.0.0/8 encap l3dsr dscp 7 del")
+ self.vapi.cli("test lb flowtable flush")