diff options
author | Steven Luong <sluong@cisco.com> | 2019-08-20 16:58:00 -0700 |
---|---|---|
committer | Damjan Marion <dmarion@me.com> | 2019-09-06 16:07:59 +0000 |
commit | a1876b84e5598fcfad1debe5abb51d152e06a66e (patch) | |
tree | 58e7d58a52b8b0beb85dc99c6071dab4a17f32e1 /src/vnet | |
parent | ffbfe3a2d6aaf4e847a1848c29fc8ce2997ed260 (diff) |
bonding: add weight support for active-backup mode
Not all interfaces have the same characteristics within the bonding group.
For active-backup mode, we should do our best to select the slave that
performs the best as the primary slave. We already did that by preferring
the slave that is on the local NUMA node. Sometimes, this is not enough. For example,
when all slaves are on the local NUMA node, the selection is arbitrary. Some slave interfaces
may have higher speed or better qos than the others. But this is hard to
infer.
One rule does not fit all. So we let the operator optionally specify the
weight for each slave interface. Our primary slave selection rule is now:
1. biggest weight
2. is local numa
3. current primary slave (to avoid churn)
4. lowest sw_if_index (for deterministic behavior)
This selection rule only applies to active-backup mode, in which only one slave
is used for forwarding traffic until it becomes unreachable. At that time,
the next "best" slave candidate is automatically promoted. The slaves are
sorted according to the preference rule when they are up. So there is no need
to find the next best candidate when the primary slave goes down.
Another good thing about this rule is that when the down slave comes back up, it
is selected as the primary slave again, unless a "better" slave was added
while it was down.
To set the weight for the slave interface, do this after the interface is
enslaved:
set interface bond <interface-name> weight <value>
Type: feature
Signed-off-by: Steven Luong <sluong@cisco.com>
Change-Id: I59ced6d20ce1dec532e667dbe1afd1b4243e04f9
Diffstat (limited to 'src/vnet')
-rw-r--r-- | src/vnet/bonding/bond.api | 20 | ||||
-rw-r--r-- | src/vnet/bonding/bond_api.c | 22 | ||||
-rw-r--r-- | src/vnet/bonding/cli.c | 274 | ||||
-rw-r--r-- | src/vnet/bonding/node.h | 24 |
4 files changed, 258 insertions, 82 deletions
diff --git a/src/vnet/bonding/bond.api b/src/vnet/bonding/bond.api index e699267ccb5..5d9a0563c1a 100644 --- a/src/vnet/bonding/bond.api +++ b/src/vnet/bonding/bond.api @@ -19,7 +19,7 @@ the bonding device driver */ -option version = "1.0.1"; +option version = "1.0.2"; /** \brief Initialize a new bond interface with the given paramters @param client_index - opaque cookie to identify the sender @@ -154,6 +154,8 @@ define sw_interface_slave_dump @param interface_name - name of interface @param is_passve - interface does not initiate the lacp protocol, remote must be active speaker @param is_long_timeout - 90 seconds vs default 3 seconds neighbor timeout + @param is_local_numa - the slave interface is local numa + @param weight - the weight for the slave interface (active-backup mode only) */ define sw_interface_slave_details { @@ -162,6 +164,22 @@ define sw_interface_slave_details u8 interface_name[64]; u8 is_passive; u8 is_long_timeout; + u8 is_local_numa; + u32 weight; +}; + +/** \brief Interface set bond weight + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param sw_if_index - slave interface for which to set the weight + @param weight - weight value to be set for the slave interface +*/ +autoreply define sw_interface_set_bond_weight +{ + u32 client_index; + u32 context; + u32 sw_if_index; + u32 weight; }; /* diff --git a/src/vnet/bonding/bond_api.c b/src/vnet/bonding/bond_api.c index 8e1842367e5..74334b52bf2 100644 --- a/src/vnet/bonding/bond_api.c +++ b/src/vnet/bonding/bond_api.c @@ -47,6 +47,7 @@ _(BOND_CREATE, bond_create) \ _(BOND_DELETE, bond_delete) \ _(BOND_ENSLAVE, bond_enslave) \ +_(SW_INTERFACE_SET_BOND_WEIGHT, sw_interface_set_bond_weight) \ _(BOND_DETACH_SLAVE, bond_detach_slave) \ _(SW_INTERFACE_BOND_DUMP, sw_interface_bond_dump)\ _(SW_INTERFACE_SLAVE_DUMP, sw_interface_slave_dump) @@ -117,6 +118,25 @@ vl_api_bond_enslave_t_handler (vl_api_bond_enslave_t * mp) } static 
void + vl_api_sw_interface_set_bond_weight_t_handler + (vl_api_sw_interface_set_bond_weight_t * mp) +{ + vlib_main_t *vm = vlib_get_main (); + bond_set_intf_weight_args_t _a, *ap = &_a; + vl_api_sw_interface_set_bond_weight_reply_t *rmp; + int rv = 0; + + clib_memset (ap, 0, sizeof (*ap)); + + ap->sw_if_index = ntohl (mp->sw_if_index); + ap->weight = ntohl (mp->weight); + + bond_set_intf_weight (vm, ap); + + REPLY_MACRO (VL_API_SW_INTERFACE_SET_BOND_WEIGHT_REPLY); +} + +static void vl_api_bond_detach_slave_t_handler (vl_api_bond_detach_slave_t * mp) { vlib_main_t *vm = vlib_get_main (); @@ -200,6 +220,8 @@ bond_send_sw_interface_slave_details (vpe_api_main_t * am, strlen ((const char *) slave_if->interface_name))); mp->is_passive = slave_if->is_passive; mp->is_long_timeout = slave_if->is_long_timeout; + mp->is_local_numa = slave_if->is_local_numa; + mp->weight = htonl (slave_if->weight); mp->context = context; vl_api_send_msg (reg, (u8 *) mp); diff --git a/src/vnet/bonding/cli.c b/src/vnet/bonding/cli.c index 4e0d30aa598..2acc670a33d 100644 --- a/src/vnet/bonding/cli.c +++ b/src/vnet/bonding/cli.c @@ -29,8 +29,6 @@ bond_disable_collecting_distributing (vlib_main_t * vm, slave_if_t * sif) bond_if_t *bif; int i; uword p; - vnet_main_t *vnm = vnet_get_main (); - vnet_hw_interface_t *hw; u8 switching_active = 0; bif = bond_get_master_by_dev_instance (sif->bif_dev_instance); @@ -40,12 +38,10 @@ bond_disable_collecting_distributing (vlib_main_t * vm, slave_if_t * sif) p = *vec_elt_at_index (bif->active_slaves, i); if (p == sif->sw_if_index) { - if (sif->sw_if_index == bif->sw_if_index_working) - { - switching_active = 1; - if (bif->mode == BOND_MODE_ACTIVE_BACKUP) - bif->is_local_numa = 0; - } + if ((bif->mode == BOND_MODE_ACTIVE_BACKUP) && (i == 0) && + (vec_len (bif->active_slaves) > 1)) + /* deleting the active slave for active-backup */ + switching_active = 1; vec_del1 (bif->active_slaves, i); hash_unset (bif->active_slave_by_sw_if_index, sif->sw_if_index); if 
(sif->lacp_enabled && bif->numa_only) @@ -64,37 +60,9 @@ bond_disable_collecting_distributing (vlib_main_t * vm, slave_if_t * sif) } /* We get a new slave just becoming active */ - if ((bif->mode == BOND_MODE_ACTIVE_BACKUP) && switching_active) - { - if ((vec_len (bif->active_slaves) >= 1)) - { - /* scan all slaves and try to find the first slave with local numa node. */ - vec_foreach_index (i, bif->active_slaves) - { - p = *vec_elt_at_index (bif->active_slaves, i); - hw = vnet_get_sup_hw_interface (vnm, p); - if (vm->numa_node == hw->numa_node) - { - bif->sw_if_index_working = p; - bif->is_local_numa = 1; - vlib_process_signal_event (bm->vlib_main, - bond_process_node.index, - BOND_SEND_GARP_NA, - bif->hw_if_index); - break; - } - } - } - - /* No local numa node is found in the active slave set. Use the first slave */ - if ((bif->is_local_numa == 0) && (vec_len (bif->active_slaves) >= 1)) - { - p = *vec_elt_at_index (bif->active_slaves, 0); - bif->sw_if_index_working = p; - vlib_process_signal_event (bm->vlib_main, bond_process_node.index, - BOND_SEND_GARP_NA, bif->hw_if_index); - } - } + if (switching_active) + vlib_process_signal_event (bm->vlib_main, bond_process_node.index, + BOND_SEND_GARP_NA, bif->hw_if_index); clib_spinlock_unlock_if_init (&bif->lockp); if (bif->mode == BOND_MODE_LACP) @@ -102,6 +70,71 @@ bond_disable_collecting_distributing (vlib_main_t * vm, slave_if_t * sif) [sif->sw_if_index], sif->actor.state); } +/* + * return 1 if s2 is preferred. + * return -1 if s1 is preferred. + */ +static int +bond_slave_sort (void *a1, void *a2) +{ + u32 *s1 = a1; + u32 *s2 = a2; + slave_if_t *sif1 = bond_get_slave_by_sw_if_index (*s1); + slave_if_t *sif2 = bond_get_slave_by_sw_if_index (*s2); + bond_if_t *bif; + + ASSERT (sif1); + ASSERT (sif2); + /* + * sort entries according to preference rules: + * 1. biggest weight + * 2. numa-node + * 3. current active slave (to prevent churning) + * 4. 
lowest sw_if_index (for deterministic behavior) + * + */ + if (sif2->weight > sif1->weight) + return 1; + if (sif2->weight < sif1->weight) + return -1; + else + { + if (sif2->is_local_numa > sif1->is_local_numa) + return 1; + if (sif2->is_local_numa < sif1->is_local_numa) + return -1; + else + { + bif = bond_get_master_by_dev_instance (sif1->bif_dev_instance); + /* Favor the current active slave to avoid churning */ + if (bif->active_slaves[0] == sif2->sw_if_index) + return 1; + if (bif->active_slaves[0] == sif1->sw_if_index) + return -1; + /* go for the tiebreaker as the last resort */ + if (sif1->sw_if_index > sif2->sw_if_index) + return 1; + if (sif1->sw_if_index < sif2->sw_if_index) + return -1; + else + ASSERT (0); + } + } + return 0; +} + +static void +bond_sort_slaves (bond_if_t * bif) +{ + bond_main_t *bm = &bond_main; + u32 old_active = bif->active_slaves[0]; + + vec_sort_with_function (bif->active_slaves, bond_slave_sort); + if (old_active != bif->active_slaves[0]) + vlib_process_signal_event (bm->vlib_main, bond_process_node.index, + BOND_SEND_GARP_NA, bif->hw_if_index); +} + void bond_enable_collecting_distributing (vlib_main_t * vm, slave_if_t * sif) { @@ -109,8 +142,6 @@ bond_enable_collecting_distributing (vlib_main_t * vm, slave_if_t * sif) bond_main_t *bm = &bond_main; vnet_main_t *vnm = vnet_get_main (); vnet_hw_interface_t *hw = vnet_get_sup_hw_interface (vnm, sif->sw_if_index); - int i; - uword p; bif = bond_get_master_by_dev_instance (sif->bif_dev_instance); clib_spinlock_lock_if_init (&bif->lockp); @@ -127,43 +158,17 @@ bond_enable_collecting_distributing (vlib_main_t * vm, slave_if_t * sif) bif->n_numa_slaves++; } else - { - vec_add1 (bif->active_slaves, sif->sw_if_index); - } + vec_add1 (bif->active_slaves, sif->sw_if_index); - /* First slave becomes active? */ - if ((vec_len (bif->active_slaves) == 1) && - (bif->mode == BOND_MODE_ACTIVE_BACKUP)) + sif->is_local_numa = (vm->numa_node == hw->numa_node) ? 
1 : 0; + if (bif->mode == BOND_MODE_ACTIVE_BACKUP) { - bif->sw_if_index_working = sif->sw_if_index; - bif->is_local_numa = (vm->numa_node == hw->numa_node) ? 1 : 0; - vlib_process_signal_event (bm->vlib_main, bond_process_node.index, - BOND_SEND_GARP_NA, bif->hw_if_index); - } - else if ((vec_len (bif->active_slaves) > 1) - && (bif->mode == BOND_MODE_ACTIVE_BACKUP) - && bif->is_local_numa == 0) - { - if (vm->numa_node == hw->numa_node) - { - vec_foreach_index (i, bif->active_slaves) - { - p = *vec_elt_at_index (bif->active_slaves, 0); - if (p == sif->sw_if_index) - break; - - vec_del1 (bif->active_slaves, 0); - hash_unset (bif->active_slave_by_sw_if_index, p); - vec_add1 (bif->active_slaves, p); - hash_set (bif->active_slave_by_sw_if_index, p, p); - } - bif->sw_if_index_working = sif->sw_if_index; - bif->is_local_numa = 1; - vlib_process_signal_event (bm->vlib_main, - bond_process_node.index, - BOND_SEND_GARP_NA, bif->hw_if_index); - - } + if (vec_len (bif->active_slaves) == 1) + /* First slave becomes active? 
*/ + vlib_process_signal_event (bm->vlib_main, bond_process_node.index, + BOND_SEND_GARP_NA, bif->hw_if_index); + else + bond_sort_slaves (bif); } } clib_spinlock_unlock_if_init (&bif->lockp); @@ -238,6 +243,8 @@ bond_dump_slave_ifs (slave_interface_details_t ** out_slaveifs, slaveif->sw_if_index = sif->sw_if_index; slaveif->is_passive = sif->is_passive; slaveif->is_long_timeout = sif->is_long_timeout; + slaveif->is_local_numa = sif->is_local_numa; + slaveif->weight = sif->weight; } } *out_slaveifs = r_slaveifs; @@ -862,6 +869,14 @@ show_bond_details (vlib_main_t * vm) { vlib_cli_output (vm, " %U", format_vnet_sw_if_index_name, vnet_get_main (), *sw_if_index); + if (bif->mode == BOND_MODE_ACTIVE_BACKUP) + { + slave_if_t *sif = bond_get_slave_by_sw_if_index (*sw_if_index); + if (sif) + vlib_cli_output (vm, " weight: %u, is_local_numa: %u, " + "sw_if_index: %u", sif->weight, + sif->is_local_numa, sif->sw_if_index); + } } vlib_cli_output (vm, " number of slaves: %d", vec_len (bif->slaves)); vec_foreach (sw_if_index, bif->slaves) @@ -910,6 +925,113 @@ VLIB_CLI_COMMAND (show_bond_command, static) = { }; /* *INDENT-ON* */ +void +bond_set_intf_weight (vlib_main_t * vm, bond_set_intf_weight_args_t * args) +{ + slave_if_t *sif; + bond_if_t *bif; + vnet_main_t *vnm; + u32 old_weight; + + sif = bond_get_slave_by_sw_if_index (args->sw_if_index); + if (!sif) + { + args->rv = VNET_API_ERROR_INVALID_INTERFACE; + args->error = clib_error_return (0, "Interface not enslaved"); + return; + } + bif = bond_get_master_by_dev_instance (sif->bif_dev_instance); + if (!bif) + { + args->rv = VNET_API_ERROR_INVALID_INTERFACE; + args->error = clib_error_return (0, "bond interface not found"); + return; + } + if (bif->mode != BOND_MODE_ACTIVE_BACKUP) + { + args->rv = VNET_API_ERROR_INVALID_ARGUMENT; + args->error = + clib_error_return (0, "Weight valid for active-backup only"); + return; + } + + old_weight = sif->weight; + sif->weight = args->weight; + vnm = vnet_get_main (); + /* + * No need 
to sort the list if the affected slave is not up (not in active + * slave set), active slave count is 1, or the current slave is already the + * primary slave and new weight > old weight. + */ + if (!vnet_sw_interface_is_up (vnm, sif->sw_if_index) || + (vec_len (bif->active_slaves) == 1) || + ((bif->active_slaves[0] == sif->sw_if_index) && + (sif->weight >= old_weight))) + return; + + bond_sort_slaves (bif); +} + +static clib_error_t * +bond_set_intf_cmd (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + bond_set_intf_weight_args_t args = { 0 }; + u32 sw_if_index = (u32) ~ 0; + unformat_input_t _line_input, *line_input = &_line_input; + vnet_main_t *vnm = vnet_get_main (); + u8 weight_enter = 0; + u32 weight = 0; + + /* Get a line of input. */ + if (!unformat_user (input, unformat_line_input, line_input)) + return clib_error_return (0, "Missing required arguments."); + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "sw_if_index %d", &sw_if_index)) + ; + else if (unformat (line_input, "%U", unformat_vnet_sw_interface, vnm, + &sw_if_index)) + ; + else if (unformat (line_input, "weight %u", &weight)) + weight_enter = 1; + else + { + clib_error_return (0, "unknown input `%U'", format_unformat_error, + input); + break; + } + } + + unformat_free (line_input); + if (sw_if_index == (u32) ~ 0) + { + args.rv = VNET_API_ERROR_INVALID_INTERFACE; + clib_error_return (0, "Interface name is invalid!"); + } + if (weight_enter == 0) + { + args.rv = VNET_API_ERROR_INVALID_ARGUMENT; + clib_error_return (0, "weight missing"); + } + + args.sw_if_index = sw_if_index; + args.weight = weight; + bond_set_intf_weight (vm, &args); + + return args.error; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND(set_interface_bond_cmd, static) = { + .path = "set interface bond", + .short_help = "set interface bond <interface> | sw_if_index <idx>" + " weight <value>", + .function = bond_set_intf_cmd, +}; +/* *INDENT-ON* */ + 
clib_error_t * bond_cli_init (vlib_main_t * vm) { diff --git a/src/vnet/bonding/node.h b/src/vnet/bonding/node.h index 1ad19dec872..1479209369a 100644 --- a/src/vnet/bonding/node.h +++ b/src/vnet/bonding/node.h @@ -110,6 +110,15 @@ typedef struct clib_error_t *error; } bond_detach_slave_args_t; +typedef struct +{ + u32 sw_if_index; + u32 weight; + /* return */ + int rv; + clib_error_t *error; +} bond_set_intf_weight_args_t; + /** BOND interface details struct */ typedef struct { @@ -130,6 +139,8 @@ typedef struct u8 interface_name[64]; u8 is_passive; u8 is_long_timeout; + u8 is_local_numa; + u32 weight; u32 active_slaves; } slave_interface_details_t; @@ -159,11 +170,6 @@ typedef struct u8 mode; u8 lb; - /* This flag works for active-backup mode only - and marks if the working port is local numa. */ - u8 is_local_numa; - /* current working sw_if_index in active-bakeup mode. */ - u32 sw_if_index_working; /* the last slave index for the rr lb */ u32 lb_rr_last_index; @@ -239,6 +245,9 @@ typedef struct /* neighbor vlib hw_if_index */ u32 hw_if_index; + /* weight -- valid only for active backup */ + u32 weight; + /* actor does not initiate the protocol exchange */ u8 is_passive; @@ -336,6 +345,9 @@ typedef struct /* pdu sent */ u64 marker_pdu_sent; + + /* slave is numa node */ + u8 is_local_numa; } slave_if_t; typedef void (*lacp_enable_disable_func) (vlib_main_t * vm, bond_if_t * bif, @@ -398,6 +410,8 @@ void bond_disable_collecting_distributing (vlib_main_t * vm, void bond_enable_collecting_distributing (vlib_main_t * vm, slave_if_t * sif); u8 *format_bond_interface_name (u8 * s, va_list * args); +void bond_set_intf_weight (vlib_main_t * vm, + bond_set_intf_weight_args_t * args); void bond_create_if (vlib_main_t * vm, bond_create_if_args_t * args); int bond_delete_if (vlib_main_t * vm, u32 sw_if_index); void bond_enslave (vlib_main_t * vm, bond_enslave_args_t * args); |