diff options
Diffstat (limited to 'src/vnet/bonding')
-rw-r--r-- | src/vnet/bonding/bond.api | 163 | ||||
-rw-r--r-- | src/vnet/bonding/bond_api.c | 328 | ||||
-rw-r--r-- | src/vnet/bonding/cli.c | 706 | ||||
-rw-r--r-- | src/vnet/bonding/device.c | 610 | ||||
-rw-r--r-- | src/vnet/bonding/node.c | 509 | ||||
-rw-r--r-- | src/vnet/bonding/node.h | 451 |
6 files changed, 2767 insertions, 0 deletions
diff --git a/src/vnet/bonding/bond.api b/src/vnet/bonding/bond.api new file mode 100644 index 00000000000..e8919e14904 --- /dev/null +++ b/src/vnet/bonding/bond.api @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** \file + + This file defines vpe control-plane API messages for + the bonding device driver +*/ + +option version = "1.0.0"; + +/** \brief Initialize a new bond interface with the given parameters + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param use_custom_mac - if set, mac_address is valid + @param mac_address - mac addr to assign to the interface if use_custom_mac is set + @param mode - mode, required (1=round-robin, 2=active-backup, 3=xor, 4=broadcast, 5=lacp) + @param lb - load balance, optional (0=l2, 1=l34, 2=l23) valid for xor and lacp modes.
Otherwise ignored +*/ +define bond_create +{ + u32 client_index; + u32 context; + u8 use_custom_mac; + u8 mac_address[6]; + u8 mode; + u8 lb; +}; + +/** \brief Reply for bond create request + @param context - returned sender context, to match reply w/ request + @param retval - return code + @param sw_if_index - software index allocated for the new bond interface +*/ +define bond_create_reply +{ + u32 context; + i32 retval; + u32 sw_if_index; +}; + +/** \brief Delete bond interface + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param sw_if_index - interface index of the bond interface to delete +*/ +autoreply define bond_delete +{ + u32 client_index; + u32 context; + u32 sw_if_index; +}; + +/** \brief Enslave an interface to a bond interface + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param sw_if_index - slave sw_if_index + @param bond_sw_if_index - bond sw_if_index + @param is_passive - interface does not initiate the lacp protocol, remote must be active speaker + @param is_long_timeout - 90 seconds vs default 3 seconds neighbor timeout +*/ +define bond_enslave +{ + u32 client_index; + u32 context; + u32 sw_if_index; + u32 bond_sw_if_index; + u8 is_passive; + u8 is_long_timeout; +}; + +/** \brief Reply for bond enslave request + @param context - returned sender context, to match reply w/ request + @param retval - return code +*/ +define bond_enslave_reply +{ + u32 context; + i32 retval; +}; + +/** \brief bond detach slave + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param sw_if_index - interface index of slave interface +*/ +autoreply define bond_detach_slave +{ + u32 client_index; + u32 context; + u32 sw_if_index; +}; + +/** \brief Dump bond interfaces request */ +define sw_interface_bond_dump +{ + u32 client_index; + u32 context;
+}; + +/** \brief Reply for bond dump request + @param context - returned sender context, to match reply w/ request + @param sw_if_index - software index of bond interface + @param interface_name - name of interface + @param mode - bonding mode + @param lb - load balance algo + @param active_slaves - active slaves count + @param slaves - config slave count +*/ +define sw_interface_bond_details +{ + u32 context; + u32 sw_if_index; + u8 interface_name[64]; + u8 mode; + u8 lb; + u32 active_slaves; + u32 slaves; +}; + +/** \brief bond slave dump + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param sw_if_index - interface index of bond interface +*/ +define sw_interface_slave_dump +{ + u32 client_index; + u32 context; + u32 sw_if_index; +}; + +/** \brief Reply for slave dump request + @param context - returned sender context, to match reply w/ request + @param sw_if_index - software index of slave interface + @param interface_name - name of interface + @param is_passive - interface does not initiate the lacp protocol, remote must be active speaker + @param is_long_timeout - 90 seconds vs default 3 seconds neighbor timeout +*/ +define sw_interface_slave_details +{ + u32 context; + u32 sw_if_index; + u8 interface_name[64]; + u8 is_passive; + u8 is_long_timeout; +}; + +/* + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/bonding/bond_api.c b/src/vnet/bonding/bond_api.c new file mode 100644 index 00000000000..02536e966a1 --- /dev/null +++ b/src/vnet/bonding/bond_api.c @@ -0,0 +1,328 @@ +/* + *------------------------------------------------------------------ + * bond_api.c - vnet bonding device driver API support + * + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#include <vnet/vnet.h> +#include <vlibmemory/api.h> + +#include <vnet/interface.h> +#include <vnet/api_errno.h> +#include <vnet/ethernet/ethernet.h> + +#include <vnet/vnet_msg_enum.h> + +#define vl_typedefs /* define message structures */ +#include <vnet/vnet_all_api_h.h> +#undef vl_typedefs + +#define vl_endianfun /* define message structures */ +#include <vnet/vnet_all_api_h.h> +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__) +#define vl_printfun +#include <vnet/vnet_all_api_h.h> +#undef vl_printfun + +#include <vlibapi/api_helper_macros.h> +#include <vnet/bonding/node.h> + +#define foreach_bond_api_msg \ +_(BOND_CREATE, bond_create) \ +_(BOND_DELETE, bond_delete) \ +_(BOND_ENSLAVE, bond_enslave) \ +_(BOND_DETACH_SLAVE, bond_detach_slave) \ +_(SW_INTERFACE_BOND_DUMP, sw_interface_bond_dump)\ +_(SW_INTERFACE_SLAVE_DUMP, sw_interface_slave_dump) + +static void +bond_send_sw_interface_event_deleted (vpe_api_main_t * am, + unix_shared_memory_queue_t * q, + u32 sw_if_index) +{ + vl_api_sw_interface_event_t *mp; + + mp = vl_msg_api_alloc (sizeof (*mp)); + memset (mp, 0, sizeof (*mp)); + mp->_vl_msg_id = ntohs (VL_API_SW_INTERFACE_EVENT); + mp->sw_if_index = ntohl (sw_if_index); + + mp->admin_up_down = 0; + mp->link_up_down = 0; + mp->deleted = 1; + vl_msg_api_send_shmem (q, (u8 *) & mp); +} + +static void +vl_api_bond_delete_t_handler (vl_api_bond_delete_t * mp) +{ + 
vlib_main_t *vm = vlib_get_main (); + int rv; + vpe_api_main_t *vam = &vpe_api_main; + vl_api_bond_delete_reply_t *rmp; + unix_shared_memory_queue_t *q; + u32 sw_if_index = ntohl (mp->sw_if_index); + + rv = bond_delete_if (vm, sw_if_index); + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (!q) + return; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_BOND_DELETE_REPLY); + rmp->context = mp->context; + rmp->retval = ntohl (rv); + + vl_msg_api_send_shmem (q, (u8 *) & rmp); + + if (!rv) + bond_send_sw_interface_event_deleted (vam, q, sw_if_index); +} + +static void +vl_api_bond_create_t_handler (vl_api_bond_create_t * mp) +{ + vlib_main_t *vm = vlib_get_main (); + vl_api_bond_create_reply_t *rmp; + unix_shared_memory_queue_t *q; + bond_create_if_args_t _a, *ap = &_a; + + memset (ap, 0, sizeof (*ap)); + + if (mp->use_custom_mac) + { + clib_memcpy (ap->hw_addr, mp->mac_address, 6); + ap->hw_addr_set = 1; + } + + ap->mode = mp->mode; + ap->lb = mp->lb; + bond_create_if (vm, ap); + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (!q) + return; + + if (ap->rv != 0) + return; + rmp = vl_msg_api_alloc (sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_BOND_CREATE_REPLY); + rmp->context = mp->context; + rmp->retval = ntohl (ap->rv); + rmp->sw_if_index = ntohl (ap->sw_if_index); + + vl_msg_api_send_shmem (q, (u8 *) & rmp); +} + +static void +vl_api_bond_enslave_t_handler (vl_api_bond_enslave_t * mp) +{ + vlib_main_t *vm = vlib_get_main (); + vl_api_bond_enslave_reply_t *rmp; + unix_shared_memory_queue_t *q; + bond_enslave_args_t _a, *ap = &_a; + + memset (ap, 0, sizeof (*ap)); + + ap->group = ntohl (mp->bond_sw_if_index); + ap->slave = ntohl (mp->sw_if_index); + ap->is_passive = mp->is_passive; + ap->is_long_timeout = mp->is_long_timeout; + + bond_enslave (vm, ap); + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (!q) + return; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + rmp->_vl_msg_id = 
ntohs (VL_API_BOND_ENSLAVE_REPLY); + rmp->context = mp->context; + rmp->retval = ntohl (ap->rv); + + vl_msg_api_send_shmem (q, (u8 *) & rmp); +} + +static void +vl_api_bond_detach_slave_t_handler (vl_api_bond_detach_slave_t * mp) +{ + vlib_main_t *vm = vlib_get_main (); + vl_api_bond_detach_slave_reply_t *rmp; + unix_shared_memory_queue_t *q; + bond_detach_slave_args_t _a, *ap = &_a; + + memset (ap, 0, sizeof (*ap)); + + ap->slave = ntohl (mp->sw_if_index); + bond_detach_slave (vm, ap); + + q = vl_api_client_index_to_input_queue (mp->client_index); + if (!q) + return; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_BOND_DETACH_SLAVE_REPLY); + rmp->context = mp->context; + rmp->retval = htonl (ap->rv); + + vl_msg_api_send_shmem (q, (u8 *) & rmp); +} + +static void +bond_send_sw_interface_details (vpe_api_main_t * am, + vl_api_registration_t * reg, + bond_interface_details_t * bond_if, + u32 context) +{ + vl_api_sw_interface_bond_details_t *mp; + + mp = vl_msg_api_alloc (sizeof (*mp)); + memset (mp, 0, sizeof (*mp)); + mp->_vl_msg_id = htons (VL_API_SW_INTERFACE_BOND_DETAILS); + mp->sw_if_index = htonl (bond_if->sw_if_index); + clib_memcpy (mp->interface_name, bond_if->interface_name, + MIN (ARRAY_LEN (mp->interface_name) - 1, + strlen ((const char *) bond_if->interface_name))); + mp->mode = bond_if->mode; + mp->lb = bond_if->lb; + mp->active_slaves = htonl (bond_if->active_slaves); + mp->slaves = htonl (bond_if->slaves); + + mp->context = context; + vl_api_send_msg (reg, (u8 *) mp); +} + +static void +vl_api_sw_interface_bond_dump_t_handler (vl_api_sw_interface_bond_dump_t * mp) +{ + int rv; + vpe_api_main_t *am = &vpe_api_main; + vl_api_registration_t *reg; + bond_interface_details_t *bondifs = NULL; + bond_interface_details_t *bond_if = NULL; + + reg = vl_api_client_index_to_registration (mp->client_index); + if (!reg) + return; + + rv = bond_dump_ifs (&bondifs); + if (rv) + return; + + vec_foreach (bond_if, bondifs) + { + 
bond_send_sw_interface_details (am, reg, bond_if, mp->context); + } + + vec_free (bondifs); +} + +static void +bond_send_sw_interface_slave_details (vpe_api_main_t * am, + vl_api_registration_t * reg, + slave_interface_details_t * slave_if, + u32 context) +{ + vl_api_sw_interface_slave_details_t *mp; + + mp = vl_msg_api_alloc (sizeof (*mp)); + memset (mp, 0, sizeof (*mp)); + mp->_vl_msg_id = htons (VL_API_SW_INTERFACE_SLAVE_DETAILS); + mp->sw_if_index = htonl (slave_if->sw_if_index); + clib_memcpy (mp->interface_name, slave_if->interface_name, + MIN (ARRAY_LEN (mp->interface_name) - 1, + strlen ((const char *) slave_if->interface_name))); + mp->is_passive = slave_if->is_passive; + mp->is_long_timeout = slave_if->is_long_timeout; + + mp->context = context; + vl_api_send_msg (reg, (u8 *) mp); +} + +static void +vl_api_sw_interface_slave_dump_t_handler (vl_api_sw_interface_slave_dump_t * + mp) +{ + int rv; + vpe_api_main_t *am = &vpe_api_main; + vl_api_registration_t *reg; + slave_interface_details_t *slaveifs = NULL; + slave_interface_details_t *slave_if = NULL; + + reg = vl_api_client_index_to_registration (mp->client_index); + if (!reg) + return; + + rv = bond_dump_slave_ifs (&slaveifs, ntohl (mp->sw_if_index)); + if (rv) + return; + + vec_foreach (slave_if, slaveifs) + { + bond_send_sw_interface_slave_details (am, reg, slave_if, mp->context); + } + + vec_free (slaveifs); +} + +#define vl_msg_name_crc_list +#include <vnet/vnet_all_api_h.h> +#undef vl_msg_name_crc_list + +static void +bond_setup_message_id_table (api_main_t * am) +{ +#define _(id,n,crc) vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id); + foreach_vl_msg_name_crc_bond; +#undef _ +} + +static clib_error_t * +bond_api_hookup (vlib_main_t * vm) +{ + api_main_t *am = &api_main; + +#define _(N,n) \ + vl_msg_api_set_handlers(VL_API_##N, #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_bond_api_msg; +#undef _ 
+ + /* + * Set up the (msg_name, crc, message-id) table + */ + bond_setup_message_id_table (am); + + return 0; +} + +VLIB_API_INIT_FUNCTION (bond_api_hookup); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/bonding/cli.c b/src/vnet/bonding/cli.c new file mode 100644 index 00000000000..b2d66f9f1c8 --- /dev/null +++ b/src/vnet/bonding/cli.c @@ -0,0 +1,706 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *------------------------------------------------------------------
+ */
+
+#include <stdint.h>
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/bonding/node.h>
+
+/*
+ * Remove sif from its bond's active-slave vector and lookup hash.
+ * Does nothing when sif is not currently active.
+ */
+void
+bond_disable_collecting_distributing (vlib_main_t * vm, slave_if_t * sif)
+{
+  bond_if_t *bif;
+  int i;
+  uword p;
+
+  bif = bond_get_master_by_dev_instance (sif->bif_dev_instance);
+  vec_foreach_index (i, bif->active_slaves)
+  {
+    p = *vec_elt_at_index (bif->active_slaves, i);
+    if (p == sif->sw_if_index)
+      {
+	vec_del1 (bif->active_slaves, i);
+	hash_unset (bif->active_slave_by_sw_if_index, sif->sw_if_index);
+	break;
+      }
+  }
+}
+
+/*
+ * Add sif to its bond's active-slave vector and lookup hash, unless it is
+ * already present in the hash.
+ */
+void
+bond_enable_collecting_distributing (vlib_main_t * vm, slave_if_t * sif)
+{
+  bond_if_t *bif;
+
+  bif = bond_get_master_by_dev_instance (sif->bif_dev_instance);
+  if (!hash_get (bif->active_slave_by_sw_if_index, sif->sw_if_index))
+    {
+      hash_set (bif->active_slave_by_sw_if_index, sif->sw_if_index,
+		sif->sw_if_index);
+      vec_add1 (bif->active_slaves, sif->sw_if_index);
+    }
+}
+
+/*
+ * Build a vector with one entry per bond interface and hand it to the
+ * caller through out_bondifs; the caller frees it.  Always returns 0.
+ */
+int
+bond_dump_ifs (bond_interface_details_t ** out_bondifs)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  bond_main_t *bm = &bond_main;
+  bond_if_t *bif;
+  vnet_hw_interface_t *hi;
+  bond_interface_details_t *r_bondifs = NULL;
+  bond_interface_details_t *bondif = NULL;
+
+  /* *INDENT-OFF* */
+  pool_foreach (bif, bm->interfaces,
+    vec_add2(r_bondifs, bondif, 1);
+    memset (bondif, 0, sizeof (*bondif));
+    bondif->sw_if_index = bif->sw_if_index;
+    hi = vnet_get_hw_interface (vnm, bif->hw_if_index);
+    clib_memcpy(bondif->interface_name, hi->name,
+                MIN (ARRAY_LEN (bondif->interface_name) - 1,
+                     strlen ((const char *) hi->name)));
+    bondif->mode = bif->mode;
+    bondif->lb = bif->lb;
+    bondif->active_slaves = vec_len (bif->active_slaves);
+    bondif->slaves = vec_len (bif->slaves);
+  );
+  /* *INDENT-ON* */
+
+  *out_bondifs = r_bondifs;
+
+  return 0;
+}
+
+/*
+ * Build a vector with one entry per slave of the bond identified by
+ * bond_sw_if_index and hand it to the caller through out_slaveifs; the
+ * caller frees it.  Returns 1 when bond_sw_if_index is not a bond, else 0.
+ */
+int
+bond_dump_slave_ifs (slave_interface_details_t ** out_slaveifs,
+		     u32 bond_sw_if_index)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  bond_if_t *bif;
+  vnet_hw_interface_t *hi;
+  vnet_sw_interface_t *sw;
+  slave_interface_details_t *r_slaveifs = NULL;
+  slave_interface_details_t *slaveif = NULL;
+  u32 *sw_if_index = NULL;
+  slave_if_t *sif;
+
+  bif = bond_get_master_by_sw_if_index (bond_sw_if_index);
+  if (!bif)
+    return 1;
+
+  vec_foreach (sw_if_index, bif->slaves)
+  {
+    vec_add2 (r_slaveifs, slaveif, 1);
+    memset (slaveif, 0, sizeof (*slaveif));
+    sif = bond_get_slave_by_sw_if_index (*sw_if_index);
+    if (sif)
+      {
+	sw = vnet_get_sw_interface (vnm, sif->sw_if_index);
+	hi = vnet_get_hw_interface (vnm, sw->hw_if_index);
+	clib_memcpy (slaveif->interface_name, hi->name,
+		     MIN (ARRAY_LEN (slaveif->interface_name) - 1,
+			  strlen ((const char *) hi->name)));
+	slaveif->sw_if_index = sif->sw_if_index;
+	slaveif->is_passive = sif->is_passive;
+	slaveif->is_long_timeout = sif->is_long_timeout;
+      }
+  }
+  *out_slaveifs = r_slaveifs;
+
+  return 0;
+}
+
+/*
+ * Detach sif from bif: release its port number, drop it from the lookup
+ * hash and from the slave / active-slave vectors, restore its original
+ * MAC address and return it to the neighbor pool.
+ *
+ * This removes an element from bif->slaves, so callers must not be
+ * iterating that vector directly.
+ */
+static void
+bond_delete_neighbor (vlib_main_t * vm, bond_if_t * bif, slave_if_t * sif)
+{
+  bond_main_t *bm = &bond_main;
+  vnet_main_t *vnm = vnet_get_main ();
+  int i;
+  vnet_hw_interface_t *hw;
+
+  bif->port_number_bitmap =
+    clib_bitmap_set (bif->port_number_bitmap,
+		     ntohs (sif->actor_admin.port_number) - 1, 0);
+  hash_unset (bm->neighbor_by_sw_if_index, sif->sw_if_index);
+  vec_free (sif->last_marker_pkt);
+  vec_free (sif->last_rx_pkt);
+  vec_foreach_index (i, bif->slaves)
+  {
+    uword p = *vec_elt_at_index (bif->slaves, i);
+    if (p == sif->sw_if_index)
+      {
+	vec_del1 (bif->slaves, i);
+	break;
+      }
+  }
+
+  bond_disable_collecting_distributing (vm, sif);
+
+  /* Put back the old mac */
+  hw = vnet_get_sup_hw_interface (vnm, sif->sw_if_index);
+  vnet_hw_interface_change_mac_address (vnm, hw->hw_if_index,
+					sif->persistent_hw_address);
+
+  /* Notify LACP while sif is still a live pool element */
+  if ((bif->mode == BOND_MODE_LACP) && bm->lacp_enable_disable)
+    (*bm->lacp_enable_disable) (vm, bif, sif, 0);
+
+  pool_put (bm->neighbors, sif);
+}
+
+/*
+ * Delete the bond interface identified by sw_if_index: detach every
+ * slave, bring the interface down, unregister it and release the
+ * per-bond state.
+ *
+ * Returns 0 on success or VNET_API_ERROR_INVALID_SW_IF_INDEX when
+ * sw_if_index does not belong to a bond device.
+ */
+int
+bond_delete_if (vlib_main_t * vm, u32 sw_if_index)
+{
+  bond_main_t *bm = &bond_main;
+  vnet_main_t *vnm = vnet_get_main ();
+  bond_if_t *bif;
+  slave_if_t *sif;
+  vnet_hw_interface_t *hw;
+  u32 *slaves;
+  u32 *sif_sw_if_index;
+
+  hw = vnet_get_sup_hw_interface (vnm, sw_if_index);
+  if (hw == NULL || bond_dev_class.index != hw->dev_class_index)
+    return VNET_API_ERROR_INVALID_SW_IF_INDEX;
+
+  bif = bond_get_master_by_dev_instance (hw->dev_instance);
+
+  /*
+   * bond_delete_neighbor () deletes from bif->slaves, so walk a snapshot;
+   * iterating the live vector would skip slaves and leave them enslaved
+   * to a bond that no longer exists.
+   */
+  slaves = vec_dup (bif->slaves);
+  vec_foreach (sif_sw_if_index, slaves)
+  {
+    sif = bond_get_slave_by_sw_if_index (*sif_sw_if_index);
+    if (sif)
+      bond_delete_neighbor (vm, bif, sif);
+  }
+  vec_free (slaves);
+
+  /* bring down the interface */
+  vnet_hw_interface_set_flags (vnm, bif->hw_if_index, 0);
+  vnet_sw_interface_set_flags (vnm, bif->sw_if_index, 0);
+
+  ethernet_delete_interface (vnm, bif->hw_if_index);
+
+  /* Release per-bond containers before the structure is zeroed */
+  clib_bitmap_free (bif->port_number_bitmap);
+  vec_free (bif->slaves);
+  vec_free (bif->active_slaves);
+  hash_free (bif->active_slave_by_sw_if_index);
+  hash_unset (bm->bond_by_sw_if_index, bif->sw_if_index);
+  memset (bif, 0, sizeof (*bif));
+  pool_put (bm->interfaces, bif);
+
+  return 0;
+}
+
+/*
+ * Create a bond interface described by args.  On failure args->rv and
+ * args->error are set; on success args->sw_if_index holds the new
+ * interface.  When no MAC address was supplied one is generated as
+ * 02:fe:<4 pseudo-random bytes>.
+ */
+void
+bond_create_if (vlib_main_t * vm, bond_create_if_args_t * args)
+{
+  bond_main_t *bm = &bond_main;
+  vnet_main_t *vnm = vnet_get_main ();
+  vnet_sw_interface_t *sw;
+  bond_if_t *bif;
+
+  if ((args->mode == BOND_MODE_LACP) && bm->lacp_plugin_loaded == 0)
+    {
+      args->rv = VNET_API_ERROR_FEATURE_DISABLED;
+      args->error = clib_error_return (0, "LACP plugin is not loaded");
+      return;
+    }
+  if (args->mode > BOND_MODE_LACP || args->mode < BOND_MODE_ROUND_ROBIN)
+    {
+      args->rv = VNET_API_ERROR_INVALID_ARGUMENT;
+      args->error = clib_error_return (0, "Invalid mode");
+      return;
+    }
+  if (args->lb > BOND_LB_L23)
+    {
+      args->rv = VNET_API_ERROR_INVALID_ARGUMENT;
+      args->error = clib_error_return (0, "Invalid load-balance");
+      return;
+    }
+  pool_get (bm->interfaces, bif);
+  memset (bif, 0, sizeof (*bif));
+  bif->dev_instance = bif - bm->interfaces;
+  bif->lb = args->lb;
+  bif->mode = args->mode;
+
+  // Special load-balance mode used for rr and bc
+  if (bif->mode == BOND_MODE_ROUND_ROBIN)
+    bif->lb = BOND_LB_RR;
+  else if (bif->mode == BOND_MODE_BROADCAST)
+    bif->lb = BOND_LB_BC;
+
+  bif->use_custom_mac = args->hw_addr_set;
+  if (!args->hw_addr_set)
+    {
+      f64 now = vlib_time_now (vm);
+      u32 rnd;
+      rnd = (u32) (now * 1e6);
+      rnd = random_u32 (&rnd);
+
+      memcpy (args->hw_addr + 2, &rnd, sizeof (rnd));
+      args->hw_addr[0] = 2;
+      args->hw_addr[1] = 0xfe;
+    }
+  memcpy (bif->hw_address, args->hw_addr, 6);
+  args->error = ethernet_register_interface
+    (vnm, bond_dev_class.index, bif - bm->interfaces /* device instance */ ,
+     bif->hw_address /* ethernet address */ ,
+     &bif->hw_if_index, 0 /* flag change */ );
+
+  if (args->error)
+    {
+      args->rv = VNET_API_ERROR_INVALID_REGISTRATION;
+      pool_put (bm->interfaces, bif);
+      return;
+    }
+
+  sw = vnet_get_hw_sw_interface (vnm, bif->hw_if_index);
+  bif->sw_if_index = sw->sw_if_index;
+  bif->group = bif->sw_if_index;
+
+  vnet_hw_interface_set_flags (vnm, bif->hw_if_index,
+			       VNET_HW_INTERFACE_FLAG_LINK_UP);
+
+  hash_set (bm->bond_by_sw_if_index, bif->sw_if_index, bif->dev_instance);
+
+  // for return
+  args->sw_if_index = bif->sw_if_index;
+}
+
+/*
+ * CLI: "create bond mode ... [load-balance ...] [hw-addr ...]".
+ * line_input is released on every path, including parse errors.
+ */
+static clib_error_t *
+bond_create_command_fn (vlib_main_t * vm, unformat_input_t * input,
+			vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  bond_create_if_args_t args = { 0 };
+  clib_error_t *error = NULL;
+  u8 mode_is_set = 0;
+
+  /* Get a line of input. */
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return clib_error_return (0, "Missing required arguments.");
+
+  args.mode = -1;
+  args.lb = BOND_LB_L2;
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (line_input, "mode %U", unformat_bond_mode, &args.mode))
+	mode_is_set = 1;
+      else if (((args.mode == BOND_MODE_LACP) || (args.mode == BOND_MODE_XOR))
+	       && unformat (line_input, "load-balance %U",
+			    unformat_bond_load_balance, &args.lb))
+	;
+      else if (unformat (line_input, "hw-addr %U",
+			 unformat_ethernet_address, args.hw_addr))
+	args.hw_addr_set = 1;
+      else
+	{
+	  error = clib_error_return (0, "unknown input `%U'",
+				     format_unformat_error, input);
+	  break;
+	}
+    }
+  unformat_free (line_input);
+
+  if (error)
+    return error;
+  if (mode_is_set == 0)
+    return clib_error_return (0, "Missing bond mode");
+
+  bond_create_if (vm, &args);
+
+  return args.error;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (bond_create_command, static) = {
+  .path = "create bond",
+  .short_help = "create bond mode {round-robin | active-backup | broadcast | "
+    "{lacp | xor} [load-balance { l2 | l23 | l34 }]} [hw-addr <mac-address>]",
+  .function = bond_create_command_fn,
+};
+/* *INDENT-ON* */
+
+/*
+ * CLI: "delete bond {<interface> | sw_if_index <sw_idx>}".
+ * line_input is released on every path, including parse errors.
+ */
+static clib_error_t *
+bond_delete_command_fn (vlib_main_t * vm, unformat_input_t * input,
+			vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  u32 sw_if_index = ~0;
+  vnet_main_t *vnm = vnet_get_main ();
+  clib_error_t *error = NULL;
+  int rv;
+
+  /* Get a line of input. */
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return clib_error_return (0, "Missing <interface>");
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (line_input, "sw_if_index %d", &sw_if_index))
+	;
+      else if (unformat (line_input, "%U", unformat_vnet_sw_interface,
+			 vnm, &sw_if_index))
+	;
+      else
+	{
+	  error = clib_error_return (0, "unknown input `%U'",
+				     format_unformat_error, input);
+	  break;
+	}
+    }
+  unformat_free (line_input);
+
+  if (error)
+    return error;
+  if (sw_if_index == ~0)
+    return clib_error_return (0,
+			      "please specify interface name or sw_if_index");
+
+  rv = bond_delete_if (vm, sw_if_index);
+  if (rv == VNET_API_ERROR_INVALID_SW_IF_INDEX)
+    return clib_error_return (0, "not a bond interface");
+  else if (rv != 0)
+    return clib_error_return (0, "error on deleting bond interface");
+
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (bond_delete__command, static) =
+{
+  .path = "delete bond",
+  .short_help = "delete bond {<interface> | sw_if_index <sw_idx>}",
+  .function = bond_delete_command_fn,
+};
+/* *INDENT-ON* */
+
+/*
+ * Enslave interface args->slave to bond args->group.  On failure args->rv
+ * and args->error are set.  The slave's original MAC is saved so it can
+ * be restored on detach; without a custom bond MAC the bond adopts the
+ * MAC of its first slave and later slaves adopt the bond's MAC.
+ */
+void
+bond_enslave (vlib_main_t * vm, bond_enslave_args_t * args)
+{
+  bond_main_t *bm = &bond_main;
+  vnet_main_t *vnm = vnet_get_main ();
+  bond_if_t *bif;
+  slave_if_t *sif;
+  vnet_interface_main_t *im = &vnm->interface_main;
+  vnet_hw_interface_t *hw, *hw2;
+  vnet_sw_interface_t *sw;
+
+  bif = bond_get_master_by_sw_if_index (args->group);
+  if (!bif)
+    {
+      args->rv = VNET_API_ERROR_INVALID_INTERFACE;
+      args->error = clib_error_return (0, "bond interface not found");
+      return;
+    }
+  // make sure the interface is not already enslaved
+  if (bond_get_slave_by_sw_if_index (args->slave))
+    {
+      args->rv = VNET_API_ERROR_VALUE_EXIST;
+      args->error = clib_error_return (0, "interface was already enslaved");
+      return;
+    }
+  // the API passes the index straight from the wire; reject unknown ones
+  if (pool_is_free_index (im->sw_interfaces, args->slave))
+    {
+      args->rv = VNET_API_ERROR_INVALID_INTERFACE;
+      args->error = clib_error_return (0, "invalid slave interface");
+      return;
+    }
+  hw = vnet_get_sup_hw_interface (vnm, args->slave);
+  if (hw->dev_class_index == bond_dev_class.index)
+    {
+      args->rv = VNET_API_ERROR_INVALID_INTERFACE;
+      args->error =
+	clib_error_return (0, "bond interface cannot be enslaved");
+      return;
+    }
+  pool_get (bm->neighbors, sif);
+  memset (sif, 0, sizeof (*sif));
+  clib_spinlock_init (&sif->lockp);
+  sw = pool_elt_at_index (im->sw_interfaces, args->slave);
+  sif->port_enabled = sw->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP;
+  sif->sw_if_index = sw->sw_if_index;
+  sif->hw_if_index = sw->hw_if_index;
+  sif->packet_template_index = (u8) ~ 0;
+  sif->is_passive = args->is_passive;
+  sif->group = args->group;
+  sif->bif_dev_instance = bif->dev_instance;
+  sif->mode = bif->mode;
+
+  sif->is_long_timeout = args->is_long_timeout;
+  if (args->is_long_timeout)
+    sif->ttl_in_seconds = LACP_LONG_TIMOUT_TIME;
+  else
+    sif->ttl_in_seconds = LACP_SHORT_TIMOUT_TIME;
+
+  hash_set (bm->neighbor_by_sw_if_index, sif->sw_if_index,
+	    sif - bm->neighbors);
+  vec_add1 (bif->slaves, sif->sw_if_index);
+
+  hw = vnet_get_sup_hw_interface (vnm, sif->sw_if_index);
+  /* Save the old mac */
+  memcpy (sif->persistent_hw_address, hw->hw_address, 6);
+  if (bif->use_custom_mac)
+    {
+      vnet_hw_interface_change_mac_address (vnm, hw->hw_if_index,
+					    bif->hw_address);
+    }
+  else
+    {
+      // bond interface gets the mac address from the first slave
+      if (vec_len (bif->slaves) == 1)
+	{
+	  memcpy (bif->hw_address, hw->hw_address, 6);
+	  hw2 = vnet_get_sup_hw_interface (vnm, bif->sw_if_index);
+	  vnet_hw_interface_change_mac_address (vnm, hw2->hw_if_index,
+						hw->hw_address);
+	}
+      else
+	{
+	  // subsequent slaves gets the mac address of the bond interface
+	  vnet_hw_interface_change_mac_address (vnm, hw->hw_if_index,
+						bif->hw_address);
+	}
+    }
+
+  if ((bif->mode == BOND_MODE_LACP) && bm->lacp_enable_disable)
+    {
+      (*bm->lacp_enable_disable) (vm, bif, sif, 1);
+    }
+  else
+    {
+      bond_enable_collecting_distributing (vm, sif);
+    }
+
+  args->rv = vnet_feature_enable_disable ("device-input", "bond-input",
+					  hw->hw_if_index, 1, 0, 0);
+
+  if (args->rv)
+    {
+      args->error =
+	clib_error_return (0,
+			   "Error encountered on input feature arc enable");
+    }
+}
+
+/* CLI: "enslave interface <if> to <bond> [passive] [long-timeout]". */
+static clib_error_t *
+enslave_interface_command_fn (vlib_main_t * vm, unformat_input_t * input,
+			      vlib_cli_command_t * cmd)
+{
+  bond_enslave_args_t args = { 0 };
+  unformat_input_t _line_input, *line_input = &_line_input;
+  vnet_main_t *vnm = vnet_get_main ();
+
+  /* Get a line of input. */
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return clib_error_return (0, "Missing required arguments.");
+
+  args.slave = ~0;
+  args.group = ~0;
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (line_input, "interface %U",
+		    unformat_vnet_sw_interface, vnm, &args.slave))
+	;
+      else if (unformat (line_input, "to %U", unformat_vnet_sw_interface, vnm,
+			 &args.group))
+	;
+      else if (unformat (line_input, "passive"))
+	args.is_passive = 1;
+      else if (unformat (line_input, "long-timeout"))
+	args.is_long_timeout = 1;
+      else
+	{
+	  args.error = clib_error_return (0, "unknown input `%U'",
+					  format_unformat_error, input);
+	  break;
+	}
+    }
+  unformat_free (line_input);
+
+  if (args.error)
+    return args.error;
+  if (args.group == ~0)
+    return clib_error_return (0, "Missing bond interface");
+  if (args.slave == ~0)
+    return clib_error_return (0, "please specify valid interface name");
+
+  bond_enslave (vm, &args);
+
+  return args.error;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (enslave_interface_command, static) = {
+  .path = "enslave",
+  .short_help = "enslave interface <interface> to <BondEthernetx> [passive] [long-timeout]",
+  .function = enslave_interface_command_fn,
+};
+/* *INDENT-ON* */
+
+/*
+ * Detach slave interface args->slave from its bond.  Sets args->rv and
+ * args->error when the interface is not enslaved.
+ */
+void
+bond_detach_slave (vlib_main_t * vm, bond_detach_slave_args_t * args)
+{
+  bond_if_t *bif;
+  slave_if_t *sif;
+
+  sif = bond_get_slave_by_sw_if_index (args->slave);
+  if (!sif)
+    {
+      args->rv = VNET_API_ERROR_INVALID_INTERFACE;
+      args->error = clib_error_return (0, "interface was not enslaved");
+      return;
+    }
+  bif = bond_get_master_by_dev_instance (sif->bif_dev_instance);
+  bond_delete_neighbor (vm, bif, sif);
+}
+
+/* CLI: "detach interface <interface>". */
+static clib_error_t *
+detach_interface_command_fn (vlib_main_t * vm, unformat_input_t * input,
+			     vlib_cli_command_t * cmd)
+{
+  bond_detach_slave_args_t args = { 0 };
+  unformat_input_t _line_input, *line_input = &_line_input;
+  vnet_main_t *vnm = vnet_get_main ();
+
+  /* Get a line of input. */
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return clib_error_return (0, "Missing required arguments.");
+
+  args.slave = ~0;
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (line_input, "interface %U",
+		    unformat_vnet_sw_interface, vnm, &args.slave))
+	;
+      else
+	{
+	  args.error = clib_error_return (0, "unknown input `%U'",
+					  format_unformat_error, input);
+	  break;
+	}
+    }
+  unformat_free (line_input);
+
+  if (args.error)
+    return args.error;
+  if (args.slave == ~0)
+    return clib_error_return (0, "please specify valid interface name");
+
+  bond_detach_slave (vm, &args);
+
+  return args.error;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (detach_interface_command, static) = {
+  .path = "detach",
+  .short_help = "detach interface <interface>",
+  .function = detach_interface_command_fn,
+};
+/* *INDENT-ON* */
+
+/* Print a one-line-per-bond summary table. */
+static void
+show_bond (vlib_main_t * vm)
+{
+  bond_main_t *bm = &bond_main;
+  bond_if_t *bif;
+
+  vlib_cli_output (vm, "%-16s %-12s %-12s %-13s %-14s %s",
+		   "interface name", "sw_if_index", "mode",
+		   "load balance", "active slaves", "slaves");
+
+  /* *INDENT-OFF* */
+  pool_foreach (bif, bm->interfaces,
+  ({
+    vlib_cli_output (vm, "%-16U %-12d %-12U %-13U %-14u %u",
+		     format_bond_interface_name, bif->dev_instance,
+		     bif->sw_if_index, format_bond_mode, bif->mode,
+		     format_bond_load_balance, bif->lb,
+		     vec_len (bif->active_slaves), vec_len (bif->slaves));
+  }));
+  /* *INDENT-ON* */
+}
+
+/* Print the detailed state of every bond, including its slave lists. */
+static void
+show_bond_details (vlib_main_t * vm)
+{
+  bond_main_t *bm = &bond_main;
+  bond_if_t *bif;
+  u32 *sw_if_index;
+
+  /* *INDENT-OFF* */
+  pool_foreach (bif, bm->interfaces,
+  ({
+    vlib_cli_output (vm, "%U", format_bond_interface_name, bif->dev_instance);
+    vlib_cli_output (vm, " mode: %U",
+		     format_bond_mode, bif->mode);
+    vlib_cli_output (vm, " load balance: %U",
+		     format_bond_load_balance, bif->lb);
+    if (bif->mode == BOND_MODE_ROUND_ROBIN)
+      vlib_cli_output (vm, " last xmit slave index: %u",
+		       bif->lb_rr_last_index);
+    vlib_cli_output (vm, " number of active slaves: %d",
+		     vec_len (bif->active_slaves));
+    vec_foreach (sw_if_index, bif->active_slaves)
+      {
+        vlib_cli_output (vm, " %U", format_vnet_sw_if_index_name,
+			 vnet_get_main (), *sw_if_index);
+      }
+    vlib_cli_output (vm, " number of slaves: %d", vec_len (bif->slaves));
+    vec_foreach (sw_if_index, bif->slaves)
+      {
+        vlib_cli_output (vm, " %U", format_vnet_sw_if_index_name,
+			 vnet_get_main (), *sw_if_index);
+      }
+    vlib_cli_output (vm, " device instance: %d", bif->dev_instance);
+    vlib_cli_output (vm, " sw_if_index: %d", bif->sw_if_index);
+    vlib_cli_output (vm, " hw_if_index: %d", bif->hw_if_index);
+  }));
+  /* *INDENT-ON* */
+}
+
+/* CLI: "show bond [details]". */
+static clib_error_t *
+show_bond_fn (vlib_main_t * vm, unformat_input_t * input,
+	      vlib_cli_command_t * cmd)
+{
+  u8 details = 0;
+
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (input, "details"))
+	details = 1;
+      else
+	{
+	  return clib_error_return (0, "unknown input `%U'",
+				    format_unformat_error, input);
+	}
+    }
+
+  if (details)
+    show_bond_details (vm);
+  else
+    show_bond (vm);
+
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (show_bond_command, static) = {
+  .path = "show bond",
+  .short_help = "show bond [details]",
+  .function = show_bond_fn,
+};
+/* *INDENT-ON* */
+
+/* Init: record the main pointers and create the slave lookup hash. */
+clib_error_t *
+bond_cli_init (vlib_main_t * vm)
+{
+  bond_main_t *bm = &bond_main;
+
+  bm->vlib_main = vm;
+  bm->vnet_main = vnet_get_main ();
+  bm->neighbor_by_sw_if_index = hash_create (0, sizeof (uword));
+
+  return 0;
+}
+
+VLIB_INIT_FUNCTION (bond_cli_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git
a/src/vnet/bonding/device.c b/src/vnet/bonding/device.c new file mode 100644 index 00000000000..8f9b3a95591 --- /dev/null +++ b/src/vnet/bonding/device.c @@ -0,0 +1,610 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#define _GNU_SOURCE +#include <stdint.h> +#include <vnet/ethernet/ethernet.h> +#include <vnet/ip/ip4_packet.h> +#include <vnet/ip/ip6_packet.h> +#include <vnet/ip/ip6_hop_by_hop_packet.h> +#include <vnet/bonding/node.h> + +#define foreach_bond_tx_error \ + _(NONE, "no error") \ + _(IF_DOWN, "interface down") \ + _(NO_SLAVE, "no slave") + +typedef enum +{ +#define _(f,s) BOND_TX_ERROR_##f, + foreach_bond_tx_error +#undef _ + BOND_TX_N_ERROR, +} bond_tx_error_t; + +static char *bond_tx_error_strings[] = { +#define _(n,s) s, + foreach_bond_tx_error +#undef _ +}; + +static u8 * +format_bond_tx_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + bond_packet_trace_t *t = va_arg (*args, bond_packet_trace_t *); + vnet_hw_interface_t *hw, *hw1; + vnet_main_t *vnm = vnet_get_main (); + + hw = vnet_get_sup_hw_interface (vnm, t->sw_if_index); + hw1 = vnet_get_sup_hw_interface (vnm, t->bond_sw_if_index); + s = format (s, "src %U, dst %U, %s 
-> %s", + format_ethernet_address, t->ethernet.src_address, + format_ethernet_address, t->ethernet.dst_address, + hw->name, hw1->name); + + return s; +} + +u8 * +format_bond_interface_name (u8 * s, va_list * args) +{ + u32 dev_instance = va_arg (*args, u32); + bond_main_t *bm = &bond_main; + bond_if_t *bif = pool_elt_at_index (bm->interfaces, dev_instance); + + s = format (s, "BondEthernet%lu", bif->dev_instance); + + return s; +} + +static __clib_unused clib_error_t * +bond_subif_add_del_function (vnet_main_t * vnm, u32 hw_if_index, + struct vnet_sw_interface_t *st, int is_add) +{ + /* Nothing for now */ + return 0; +} + +static clib_error_t * +bond_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags) +{ + vnet_hw_interface_t *hif = vnet_get_hw_interface (vnm, hw_if_index); + uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0; + bond_main_t *bm = &bond_main; + bond_if_t *bif = pool_elt_at_index (bm->interfaces, hif->dev_instance); + + bif->admin_up = is_up; + if (is_up && vec_len (bif->active_slaves)) + vnet_hw_interface_set_flags (vnm, bif->hw_if_index, + VNET_HW_INTERFACE_FLAG_LINK_UP); + return 0; +} + +static inline u32 +bond_load_balance_broadcast (vlib_main_t * vm, vlib_node_runtime_t * node, + bond_if_t * bif, vlib_buffer_t * b0) +{ + vnet_main_t *vnm = vnet_get_main (); + vlib_buffer_t *c0; + int i; + u32 *to_next = 0; + u32 sw_if_index; + vlib_frame_t *f; + + + for (i = 1; i < vec_len (bif->active_slaves); i++) + { + sw_if_index = *vec_elt_at_index (bif->active_slaves, i); + f = vnet_get_frame_to_sw_interface (vnm, sw_if_index); + to_next = vlib_frame_vector_args (f); + to_next += f->n_vectors; + c0 = vlib_buffer_copy (vm, b0); + if (PREDICT_TRUE (c0 != 0)) + { + vnet_buffer (c0)->sw_if_index[VLIB_TX] = sw_if_index; + to_next[0] = vlib_get_buffer_index (vm, c0); + f->n_vectors++; + vnet_put_frame_to_sw_interface (vnm, sw_if_index, f); + } + } + + return 0; +} + +static inline u32 +bond_load_balance_l2 (vlib_main_t * vm, 
vlib_node_runtime_t * node, + bond_if_t * bif, vlib_buffer_t * b0) +{ + ethernet_header_t *eth = (ethernet_header_t *) vlib_buffer_get_current (b0); + u32 a = 0, b = 0, c = 0, t1, t2; + u16 t11, t22; + + memcpy (&t1, eth->src_address, sizeof (t1)); + memcpy (&t11, ð->src_address[4], sizeof (t11)); + a = t1 ^ t11; + + memcpy (&t2, eth->dst_address, sizeof (t2)); + memcpy (&t22, ð->dst_address[4], sizeof (t22)); + b = t2 ^ t22; + + hash_v3_mix32 (a, b, c); + hash_v3_finalize32 (a, b, c); + + return c % vec_len (bif->active_slaves); +} + +static inline u16 * +bond_locate_ethertype (ethernet_header_t * eth) +{ + u16 *ethertype_p; + ethernet_vlan_header_t *vlan; + + if (!ethernet_frame_is_tagged (clib_net_to_host_u16 (eth->type))) + { + ethertype_p = ð->type; + } + else + { + vlan = (void *) (eth + 1); + ethertype_p = &vlan->type; + if (*ethertype_p == ntohs (ETHERNET_TYPE_VLAN)) + { + vlan++; + ethertype_p = &vlan->type; + } + } + return ethertype_p; +} + +static inline u32 +bond_load_balance_l23 (vlib_main_t * vm, vlib_node_runtime_t * node, + bond_if_t * bif, vlib_buffer_t * b0) +{ + ethernet_header_t *eth = (ethernet_header_t *) vlib_buffer_get_current (b0); + u8 ip_version; + ip4_header_t *ip4; + u16 ethertype, *ethertype_p; + + ethertype_p = bond_locate_ethertype (eth); + ethertype = *ethertype_p; + + if ((ethertype != htons (ETHERNET_TYPE_IP4)) && + (ethertype != htons (ETHERNET_TYPE_IP6))) + return (bond_load_balance_l2 (vm, node, bif, b0)); + + ip4 = (ip4_header_t *) (ethertype_p + 1); + ip_version = (ip4->ip_version_and_header_length >> 4); + + if (ip_version == 0x4) + { + u16 t11, t22; + u32 a = 0, b = 0, c = 0, t1, t2; + + memcpy (&t1, eth->src_address, sizeof (t1)); + memcpy (&t11, ð->src_address[4], sizeof (t11)); + a = t1 ^ t11; + + memcpy (&t2, eth->dst_address, sizeof (t2)); + memcpy (&t22, ð->dst_address[4], sizeof (t22)); + b = t2 ^ t22; + + c = ip4->src_address.data_u32 ^ ip4->dst_address.data_u32; + + hash_v3_mix32 (a, b, c); + hash_v3_finalize32 
(a, b, c); + + return c % vec_len (bif->active_slaves); + } + else if (ip_version == 0x6) + { + u64 a, b, c; + u64 t1 = 0, t2 = 0; + ip6_header_t *ip6 = (ip6_header_t *) (eth + 1); + + memcpy (&t1, eth->src_address, sizeof (eth->src_address)); + memcpy (&t2, eth->dst_address, sizeof (eth->dst_address)); + a = t1 ^ t2; + + b = (ip6->src_address.as_u64[0] ^ ip6->src_address.as_u64[1]); + c = (ip6->dst_address.as_u64[0] ^ ip6->dst_address.as_u64[1]); + + hash_mix64 (a, b, c); + return c % vec_len (bif->active_slaves); + } + return (bond_load_balance_l2 (vm, node, bif, b0)); +} + +static inline u32 +bond_load_balance_l34 (vlib_main_t * vm, vlib_node_runtime_t * node, + bond_if_t * bif, vlib_buffer_t * b0) +{ + ethernet_header_t *eth = (ethernet_header_t *) vlib_buffer_get_current (b0); + u8 ip_version; + uword is_tcp_udp = 0; + ip4_header_t *ip4; + u16 ethertype, *ethertype_p; + + ethertype_p = bond_locate_ethertype (eth); + ethertype = *ethertype_p; + + if ((ethertype != htons (ETHERNET_TYPE_IP4)) && + (ethertype != htons (ETHERNET_TYPE_IP6))) + return (bond_load_balance_l2 (vm, node, bif, b0)); + + ip4 = (ip4_header_t *) (ethertype_p + 1); + ip_version = (ip4->ip_version_and_header_length >> 4); + + if (ip_version == 0x4) + { + u32 a = 0, b = 0, c = 0, t1, t2; + tcp_header_t *tcp = (void *) (ip4 + 1); + is_tcp_udp = (ip4->protocol == IP_PROTOCOL_TCP) || + (ip4->protocol == IP_PROTOCOL_UDP); + + a = ip4->src_address.data_u32 ^ ip4->dst_address.data_u32; + + t1 = is_tcp_udp ? tcp->src : 0; + t2 = is_tcp_udp ? 
tcp->dst : 0; + b = t1 + (t2 << 16); + + hash_v3_mix32 (a, b, c); + hash_v3_finalize32 (a, b, c); + + return c % vec_len (bif->active_slaves); + } + else if (ip_version == 0x6) + { + u64 a, b, c; + u64 t1, t2; + ip6_header_t *ip6 = (ip6_header_t *) (eth + 1); + tcp_header_t *tcp = (void *) (ip6 + 1); + + if (PREDICT_TRUE ((ip6->protocol == IP_PROTOCOL_TCP) || + (ip6->protocol == IP_PROTOCOL_UDP))) + { + is_tcp_udp = 1; + tcp = (void *) (ip6 + 1); + } + else if (ip6->protocol == IP_PROTOCOL_IP6_HOP_BY_HOP_OPTIONS) + { + ip6_hop_by_hop_header_t *hbh = + (ip6_hop_by_hop_header_t *) (ip6 + 1); + if ((hbh->protocol == IP_PROTOCOL_TCP) + || (hbh->protocol == IP_PROTOCOL_UDP)) + { + is_tcp_udp = 1; + tcp = (tcp_header_t *) ((u8 *) hbh + ((hbh->length + 1) << 3)); + } + } + a = (ip6->src_address.as_u64[0] ^ ip6->src_address.as_u64[1]); + b = (ip6->dst_address.as_u64[0] ^ ip6->dst_address.as_u64[1]); + + t1 = is_tcp_udp ? tcp->src : 0; + t2 = is_tcp_udp ? tcp->dst : 0; + c = (t2 << 16) | t1; + hash_mix64 (a, b, c); + + return c % vec_len (bif->active_slaves); + } + + return (bond_load_balance_l2 (vm, node, bif, b0)); +} + +static inline u32 +bond_load_balance_round_robin (vlib_main_t * vm, + vlib_node_runtime_t * node, + bond_if_t * bif, vlib_buffer_t * b0) +{ + bif->lb_rr_last_index++; + bif->lb_rr_last_index %= vec_len (bif->active_slaves); + + return bif->lb_rr_last_index; +} + +static inline u32 +bond_load_balance_active_backup (vlib_main_t * vm, + vlib_node_runtime_t * node, + bond_if_t * bif, vlib_buffer_t * b0) +{ + /* First interface is the active, the rest is backup */ + return 0; +} + +static bond_load_balance_func_t bond_load_balance_table[] = { +#define _(v,f,s, p) { bond_load_balance_##p }, + foreach_bond_lb_algo +#undef _ +}; + +static uword +bond_tx_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + vnet_interface_output_runtime_t *rund = (void *) node->runtime_data; + bond_main_t *bm = &bond_main; + bond_if_t *bif = 
pool_elt_at_index (bm->interfaces, rund->dev_instance); + u32 bi0, bi1, bi2, bi3; + vlib_buffer_t *b0, *b1, *b2, *b3; + u32 *from = vlib_frame_vector_args (frame); + u32 n_left_from; + ethernet_header_t *eth; + u32 next0 = 0, next1 = 0, next2 = 0, next3 = 0; + u32 port, port1, port2, port3; + u32 sw_if_index, sw_if_index1, sw_if_index2, sw_if_index3; + bond_packet_trace_t *t0; + uword n_trace = vlib_get_trace_count (vm, node); + u16 thread_index = vlib_get_thread_index (); + vnet_main_t *vnm = vnet_get_main (); + u32 *to_next, *to_next1, *to_next2, *to_next3; + u32 sif_if_index, sif_if_index1, sif_if_index2, sif_if_index3; + vlib_frame_t *f, *f1, *f2, *f3; + + if (PREDICT_FALSE (bif->admin_up == 0)) + { + vlib_buffer_free (vm, vlib_frame_args (frame), frame->n_vectors); + vlib_increment_simple_counter (vnet_main.interface_main.sw_if_counters + + VNET_INTERFACE_COUNTER_DROP, + thread_index, bif->sw_if_index, + frame->n_vectors); + vlib_error_count (vm, node->node_index, BOND_TX_ERROR_IF_DOWN, + frame->n_vectors); + return frame->n_vectors; + } + + if (PREDICT_FALSE (vec_len (bif->active_slaves) == 0)) + { + bi0 = from[0]; + b0 = vlib_get_buffer (vm, bi0); + vlib_increment_combined_counter + (vnet_main.interface_main.combined_sw_if_counters + + VNET_INTERFACE_COUNTER_TX, thread_index, bif->sw_if_index, + frame->n_vectors, b0->current_length); + + vlib_buffer_free (vm, vlib_frame_args (frame), frame->n_vectors); + vlib_increment_simple_counter (vnet_main.interface_main.sw_if_counters + + VNET_INTERFACE_COUNTER_DROP, + thread_index, bif->sw_if_index, + frame->n_vectors); + vlib_error_count (vm, node->node_index, BOND_TX_ERROR_NO_SLAVE, + frame->n_vectors); + return frame->n_vectors; + } + + /* Number of buffers / pkts */ + n_left_from = frame->n_vectors; + + while (n_left_from >= 8) + { + // Prefetch next iteration + { + vlib_buffer_t *p4, *p5, *p6, *p7; + + p4 = vlib_get_buffer (vm, from[4]); + p5 = vlib_get_buffer (vm, from[5]); + p6 = vlib_get_buffer (vm, from[6]); 
+ p7 = vlib_get_buffer (vm, from[7]); + + vlib_prefetch_buffer_header (p4, STORE); + vlib_prefetch_buffer_header (p5, STORE); + vlib_prefetch_buffer_header (p6, STORE); + vlib_prefetch_buffer_header (p7, STORE); + + CLIB_PREFETCH (p4->data, CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (p5->data, CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (p6->data, CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (p7->data, CLIB_CACHE_LINE_BYTES, LOAD); + } + + bi0 = from[0]; + bi1 = from[1]; + bi2 = from[2]; + bi3 = from[3]; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + b2 = vlib_get_buffer (vm, bi2); + b3 = vlib_get_buffer (vm, bi3); + + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b1); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b2); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b3); + + sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_TX]; + sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_TX]; + sw_if_index2 = vnet_buffer (b2)->sw_if_index[VLIB_TX]; + sw_if_index3 = vnet_buffer (b3)->sw_if_index[VLIB_TX]; + + port = + (bond_load_balance_table[bif->lb]).load_balance (vm, node, bif, b0); + port1 = + (bond_load_balance_table[bif->lb]).load_balance (vm, node, bif, b1); + port2 = + (bond_load_balance_table[bif->lb]).load_balance (vm, node, bif, b2); + port3 = + (bond_load_balance_table[bif->lb]).load_balance (vm, node, bif, b3); + + sif_if_index = *vec_elt_at_index (bif->active_slaves, port); + sif_if_index1 = *vec_elt_at_index (bif->active_slaves, port1); + sif_if_index2 = *vec_elt_at_index (bif->active_slaves, port2); + sif_if_index3 = *vec_elt_at_index (bif->active_slaves, port3); + + vnet_buffer (b0)->sw_if_index[VLIB_TX] = sif_if_index; + vnet_buffer (b1)->sw_if_index[VLIB_TX] = sif_if_index1; + vnet_buffer (b2)->sw_if_index[VLIB_TX] = sif_if_index2; + vnet_buffer (b3)->sw_if_index[VLIB_TX] = sif_if_index3; + + f = vnet_get_frame_to_sw_interface (vnm, sif_if_index); + f1 = vnet_get_frame_to_sw_interface (vnm, sif_if_index1); + f2 = 
vnet_get_frame_to_sw_interface (vnm, sif_if_index2); + f3 = vnet_get_frame_to_sw_interface (vnm, sif_if_index3); + + to_next = vlib_frame_vector_args (f); + to_next1 = vlib_frame_vector_args (f1); + to_next2 = vlib_frame_vector_args (f2); + to_next3 = vlib_frame_vector_args (f3); + + to_next += f->n_vectors; + to_next1 += f1->n_vectors; + to_next2 += f2->n_vectors; + to_next3 += f3->n_vectors; + + to_next[0] = vlib_get_buffer_index (vm, b0); + to_next1[0] = vlib_get_buffer_index (vm, b1); + to_next2[0] = vlib_get_buffer_index (vm, b2); + to_next3[0] = vlib_get_buffer_index (vm, b3); + + f->n_vectors++; + f1->n_vectors++; + f2->n_vectors++; + f3->n_vectors++; + + vnet_put_frame_to_sw_interface (vnm, sif_if_index, f); + vnet_put_frame_to_sw_interface (vnm, sif_if_index1, f1); + vnet_put_frame_to_sw_interface (vnm, sif_if_index2, f2); + vnet_put_frame_to_sw_interface (vnm, sif_if_index3, f3); + + if (PREDICT_FALSE (n_trace > 0)) + { + vlib_trace_buffer (vm, node, next0, b0, 0 /* follow_chain */ ); + vlib_set_trace_count (vm, node, --n_trace); + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + eth = (ethernet_header_t *) vlib_buffer_get_current (b0); + t0->ethernet = *eth; + t0->sw_if_index = sw_if_index; + t0->bond_sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_TX]; + + if (PREDICT_TRUE (n_trace > 0)) + { + vlib_trace_buffer (vm, node, next1, b1, 0 /* follow_chain */ ); + vlib_set_trace_count (vm, node, --n_trace); + t0 = vlib_add_trace (vm, node, b1, sizeof (*t0)); + eth = (ethernet_header_t *) vlib_buffer_get_current (b1); + t0->ethernet = *eth; + t0->sw_if_index = sw_if_index1; + t0->bond_sw_if_index = vnet_buffer (b1)->sw_if_index[VLIB_TX]; + + if (PREDICT_TRUE (n_trace > 0)) + { + vlib_trace_buffer (vm, node, next2, b2, + 0 /* follow_chain */ ); + vlib_set_trace_count (vm, node, --n_trace); + t0 = vlib_add_trace (vm, node, b2, sizeof (*t0)); + eth = (ethernet_header_t *) vlib_buffer_get_current (b2); + t0->ethernet = *eth; + t0->sw_if_index = sw_if_index2; + 
t0->bond_sw_if_index = + vnet_buffer (b2)->sw_if_index[VLIB_TX]; + + if (PREDICT_TRUE (n_trace > 0)) + { + vlib_trace_buffer (vm, node, next3, b3, + 0 /* follow_chain */ ); + vlib_set_trace_count (vm, node, --n_trace); + t0 = vlib_add_trace (vm, node, b3, sizeof (*t0)); + eth = + (ethernet_header_t *) vlib_buffer_get_current (b3); + t0->ethernet = *eth; + t0->sw_if_index = sw_if_index3; + t0->bond_sw_if_index = + vnet_buffer (b3)->sw_if_index[VLIB_TX]; + } + } + } + } + + from += 4; + n_left_from -= 4; + } + + while (n_left_from > 0) + { + // Prefetch next iteration + if (n_left_from > 1) + { + vlib_buffer_t *p2; + + p2 = vlib_get_buffer (vm, from[1]); + vlib_prefetch_buffer_header (p2, STORE); + CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, LOAD); + } + + bi0 = from[0]; + b0 = vlib_get_buffer (vm, bi0); + + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + + sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_TX]; + + port = + (bond_load_balance_table[bif->lb]).load_balance (vm, node, bif, b0); + sif_if_index = *vec_elt_at_index (bif->active_slaves, port); + vnet_buffer (b0)->sw_if_index[VLIB_TX] = sif_if_index; + f = vnet_get_frame_to_sw_interface (vnm, sif_if_index); + to_next = vlib_frame_vector_args (f); + to_next += f->n_vectors; + + to_next[0] = vlib_get_buffer_index (vm, b0); + f->n_vectors++; + vnet_put_frame_to_sw_interface (vnm, sif_if_index, f); + + if (PREDICT_FALSE (n_trace > 0)) + { + vlib_trace_buffer (vm, node, next0, b0, 0 /* follow_chain */ ); + vlib_set_trace_count (vm, node, --n_trace); + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + eth = (ethernet_header_t *) vlib_buffer_get_current (b0); + t0->ethernet = *eth; + t0->sw_if_index = sw_if_index; + t0->bond_sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_TX]; + } + + from += 1; + n_left_from -= 1; + } + + vlib_increment_simple_counter (vnet_main.interface_main.sw_if_counters + + VNET_INTERFACE_COUNTER_TX, thread_index, + bif->sw_if_index, frame->n_vectors); + + return frame->n_vectors; +} + +/* 
*INDENT-OFF* */ +VNET_DEVICE_CLASS (bond_dev_class) = { + .name = "bond", + .tx_function = bond_tx_fn, + .tx_function_n_errors = BOND_TX_N_ERROR, + .tx_function_error_strings = bond_tx_error_strings, + .format_device_name = format_bond_interface_name, + .admin_up_down_function = bond_interface_admin_up_down, + .subif_add_del_function = bond_subif_add_del_function, + .format_tx_trace = format_bond_tx_trace, +}; + +VLIB_DEVICE_TX_FUNCTION_MULTIARCH (bond_dev_class, bond_tx_fn) +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/bonding/node.c b/src/vnet/bonding/node.c new file mode 100644 index 00000000000..4deec829195 --- /dev/null +++ b/src/vnet/bonding/node.c @@ -0,0 +1,509 @@ +/* + *------------------------------------------------------------------ + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *------------------------------------------------------------------ + */ + +#define _GNU_SOURCE +#include <stdint.h> +#include <vnet/llc/llc.h> +#include <vnet/snap/snap.h> +#include <vnet/bonding/node.h> + +bond_main_t bond_main; + +#define foreach_bond_input_error \ + _(NONE, "no error") \ + _(IF_DOWN, "interface down") \ + _(NO_SLAVE, "no slave") \ + _(NO_BOND, "no bond interface")\ + _(PASS_THRU, "pass through") + +typedef enum +{ +#define _(f,s) BOND_INPUT_ERROR_##f, + foreach_bond_input_error +#undef _ + BOND_INPUT_N_ERROR, +} bond_input_error_t; + +static char *bond_input_error_strings[] = { +#define _(n,s) s, + foreach_bond_input_error +#undef _ +}; + +static u8 * +format_bond_input_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + bond_packet_trace_t *t = va_arg (*args, bond_packet_trace_t *); + vnet_hw_interface_t *hw, *hw1; + vnet_main_t *vnm = vnet_get_main (); + + hw = vnet_get_sup_hw_interface (vnm, t->sw_if_index); + hw1 = vnet_get_sup_hw_interface (vnm, t->bond_sw_if_index); + s = format (s, "src %U, dst %U, %s -> %s", + format_ethernet_address, t->ethernet.src_address, + format_ethernet_address, t->ethernet.dst_address, + hw->name, hw1->name); + + return s; +} + +static_always_inline u8 +packet_is_cdp (ethernet_header_t * eth) +{ + llc_header_t *llc; + snap_header_t *snap; + + llc = (llc_header_t *) (eth + 1); + snap = (snap_header_t *) (llc + 1); + + return ((eth->type == htons (ETHERNET_TYPE_CDP)) || + ((llc->src_sap == 0xAA) && (llc->control == 0x03) && + (snap->protocol == htons (0x2000)) && + (snap->oui[0] == 0) && (snap->oui[1] == 0) && + (snap->oui[2] == 0x0C))); +} + +static inline void +bond_sw_if_index_rewrite (vlib_main_t * vm, vlib_node_runtime_t * node, + slave_if_t * sif, ethernet_header_t * eth, + vlib_buffer_t * b0) +{ + bond_if_t *bif; + u16 thread_index = vlib_get_thread_index (); + u16 *ethertype_p, ethertype; 
+ ethernet_vlan_header_t *vlan; + + if (PREDICT_TRUE (sif != 0)) + { + bif = bond_get_master_by_sw_if_index (sif->group); + if (PREDICT_TRUE (bif != 0)) + { + if (PREDICT_TRUE (vec_len (bif->slaves) >= 1)) + { + if (PREDICT_TRUE (bif->admin_up == 1)) + { + if (!ethernet_frame_is_tagged (ntohs (eth->type))) + { + // Let some layer2 packets pass through. + if (PREDICT_TRUE ((eth->type != + htons (ETHERNET_TYPE_SLOW_PROTOCOLS)) + && !packet_is_cdp (eth) + && (eth->type != + htons + (ETHERNET_TYPE_802_1_LLDP)))) + { + // Change the physical interface to + // bond interface + vnet_buffer (b0)->sw_if_index[VLIB_RX] = + bif->sw_if_index; + + /* increase rx counters */ + vlib_increment_simple_counter + (vnet_main.interface_main.sw_if_counters + + VNET_INTERFACE_COUNTER_RX, thread_index, + bif->sw_if_index, 1); + } + else + { + vlib_error_count (vm, node->node_index, + BOND_INPUT_ERROR_PASS_THRU, 1); + } + } + else + { + vlan = (void *) (eth + 1); + ethertype_p = &vlan->type; + if (*ethertype_p == ntohs (ETHERNET_TYPE_VLAN)) + { + vlan++; + ethertype_p = &vlan->type; + } + ethertype = *ethertype_p; + if (PREDICT_TRUE ((ethertype != + htons (ETHERNET_TYPE_SLOW_PROTOCOLS)) + && (ethertype != + htons (ETHERNET_TYPE_CDP)) + && (ethertype != + htons + (ETHERNET_TYPE_802_1_LLDP)))) + { + // Change the physical interface to + // bond interface + vnet_buffer (b0)->sw_if_index[VLIB_RX] = + bif->sw_if_index; + + /* increase rx counters */ + vlib_increment_simple_counter + (vnet_main.interface_main.sw_if_counters + + VNET_INTERFACE_COUNTER_RX, thread_index, + bif->sw_if_index, 1); + } + else + { + vlib_error_count (vm, node->node_index, + BOND_INPUT_ERROR_PASS_THRU, 1); + } + } + } + else + { + vlib_error_count (vm, node->node_index, + BOND_INPUT_ERROR_IF_DOWN, 1); + } + } + else + { + vlib_error_count (vm, node->node_index, + BOND_INPUT_ERROR_NO_SLAVE, 1); + } + } + else + { + vlib_error_count (vm, node->node_index, + BOND_INPUT_ERROR_NO_BOND, 1); + } + } + else + { + 
vlib_error_count (vm, node->node_index, BOND_INPUT_ERROR_NO_SLAVE, 1); + } + +} + +static uword +bond_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 bi0, bi1, bi2, bi3; + vlib_buffer_t *b0, *b1, *b2, *b3; + u32 next_index; + u32 *from, *to_next, n_left_from, n_left_to_next; + ethernet_header_t *eth, *eth1, *eth2, *eth3; + u32 next0, next1, next2, next3; + bond_packet_trace_t *t0; + uword n_trace = vlib_get_trace_count (vm, node); + u32 sw_if_index, sw_if_index1, sw_if_index2, sw_if_index3; + slave_if_t *sif, *sif1, *sif2, *sif3; + u16 thread_index = vlib_get_thread_index (); + + /* Vector of buffer / pkt indices we're supposed to process */ + from = vlib_frame_vector_args (frame); + + /* Number of buffers / pkts */ + n_left_from = frame->n_vectors; + + /* Speculatively send the first buffer to the last disposition we used */ + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + /* set up to enqueue to our disposition with index = next_index */ + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 12 && n_left_to_next >= 4) + { + // Prefetch next iteration + { + vlib_buffer_t *b4, *b5, *b6, *b7; + + b4 = vlib_get_buffer (vm, from[4]); + b5 = vlib_get_buffer (vm, from[5]); + b6 = vlib_get_buffer (vm, from[6]); + b7 = vlib_get_buffer (vm, from[7]); + + vlib_prefetch_buffer_header (b4, STORE); + vlib_prefetch_buffer_header (b5, STORE); + vlib_prefetch_buffer_header (b6, STORE); + vlib_prefetch_buffer_header (b7, STORE); + + CLIB_PREFETCH (b4->data, CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (b5->data, CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (b6->data, CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (b7->data, CLIB_CACHE_LINE_BYTES, LOAD); + } + + next0 = 0; + next1 = 0; + next2 = 0; + next3 = 0; + + bi0 = from[0]; + bi1 = from[1]; + bi2 = from[2]; + bi3 = from[3]; + + to_next[0] = bi0; + to_next[1] = bi1; + to_next[2] = bi2; + to_next[3] = bi3; + + from += 4; 
+ to_next += 4; + n_left_from -= 4; + n_left_to_next -= 4; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + b2 = vlib_get_buffer (vm, bi2); + b3 = vlib_get_buffer (vm, bi3); + + vnet_feature_next (vnet_buffer (b0)->sw_if_index[VLIB_RX], &next0, + b0); + vnet_feature_next (vnet_buffer (b1)->sw_if_index[VLIB_RX], &next1, + b1); + vnet_feature_next (vnet_buffer (b2)->sw_if_index[VLIB_RX], &next2, + b2); + vnet_feature_next (vnet_buffer (b3)->sw_if_index[VLIB_RX], &next3, + b3); + + eth = (ethernet_header_t *) vlib_buffer_get_current (b0); + eth1 = (ethernet_header_t *) vlib_buffer_get_current (b1); + eth2 = (ethernet_header_t *) vlib_buffer_get_current (b2); + eth3 = (ethernet_header_t *) vlib_buffer_get_current (b3); + + sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_RX]; + sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_RX]; + sw_if_index2 = vnet_buffer (b2)->sw_if_index[VLIB_RX]; + sw_if_index3 = vnet_buffer (b3)->sw_if_index[VLIB_RX]; + + // sw_if_index points to the physical interface + sif = bond_get_slave_by_sw_if_index (sw_if_index); + sif1 = bond_get_slave_by_sw_if_index (sw_if_index1); + sif2 = bond_get_slave_by_sw_if_index (sw_if_index2); + sif3 = bond_get_slave_by_sw_if_index (sw_if_index3); + + bond_sw_if_index_rewrite (vm, node, sif, eth, b0); + bond_sw_if_index_rewrite (vm, node, sif1, eth1, b1); + bond_sw_if_index_rewrite (vm, node, sif2, eth2, b2); + bond_sw_if_index_rewrite (vm, node, sif3, eth3, b3); + + if (PREDICT_FALSE (n_trace > 0)) + { + vlib_trace_buffer (vm, node, next0, b0, 0 /* follow_chain */ ); + vlib_set_trace_count (vm, node, --n_trace); + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + t0->ethernet = *eth; + t0->sw_if_index = sw_if_index; + t0->bond_sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_RX]; + + if (PREDICT_TRUE (n_trace > 0)) + { + vlib_trace_buffer (vm, node, next1, b1, + 0 /* follow_chain */ ); + vlib_set_trace_count (vm, node, --n_trace); + t0 = vlib_add_trace (vm, node, b1, sizeof (*t0)); 
+ t0->ethernet = *eth1; + t0->sw_if_index = sw_if_index1; + t0->bond_sw_if_index = + vnet_buffer (b1)->sw_if_index[VLIB_RX]; + + if (PREDICT_TRUE (n_trace > 0)) + { + vlib_trace_buffer (vm, node, next1, b2, + 0 /* follow_chain */ ); + vlib_set_trace_count (vm, node, --n_trace); + t0 = vlib_add_trace (vm, node, b2, sizeof (*t0)); + t0->ethernet = *eth2; + t0->sw_if_index = sw_if_index2; + t0->bond_sw_if_index = + vnet_buffer (b2)->sw_if_index[VLIB_RX]; + + if (PREDICT_TRUE (n_trace > 0)) + { + vlib_trace_buffer (vm, node, next1, b2, + 0 /* follow_chain */ ); + vlib_set_trace_count (vm, node, --n_trace); + t0 = vlib_add_trace (vm, node, b3, sizeof (*t0)); + t0->ethernet = *eth3; + t0->sw_if_index = sw_if_index3; + t0->bond_sw_if_index = + vnet_buffer (b3)->sw_if_index[VLIB_RX]; + } + } + } + } + + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b1); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b2); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b3); + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x4 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, bi2, bi3, next0, next1, + next2, next3); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + // Prefetch next iteration + if (n_left_from > 1) + { + vlib_buffer_t *p2; + + p2 = vlib_get_buffer (vm, from[1]); + vlib_prefetch_buffer_header (p2, STORE); + CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, LOAD); + } + + next0 = 0; + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + vnet_feature_next (vnet_buffer (b0)->sw_if_index[VLIB_RX], &next0, + b0); + + eth = (ethernet_header_t *) vlib_buffer_get_current (b0); + + sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_RX]; + // sw_if_index points to the physical interface + sif = bond_get_slave_by_sw_if_index (sw_if_index); + bond_sw_if_index_rewrite (vm, node, sif, eth, b0); + + 
VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, bond_input_node.index, + BOND_INPUT_ERROR_NONE, frame->n_vectors); + + vnet_device_increment_rx_packets (thread_index, frame->n_vectors); + + return frame->n_vectors; +} + +static clib_error_t * +bond_input_init (vlib_main_t * vm) +{ + return 0; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (bond_input_node) = { + .function = bond_input_fn, + .name = "bond-input", + .vector_size = sizeof (u32), + .format_buffer = format_ethernet_header_with_length, + .format_trace = format_bond_input_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = BOND_INPUT_N_ERROR, + .error_strings = bond_input_error_strings, + .n_next_nodes = 0, + .next_nodes = + { + [0] = "error-drop" + } +}; + +VLIB_INIT_FUNCTION (bond_input_init); + +VNET_FEATURE_INIT (bond_input, static) = +{ + .arc_name = "device-input", + .node_name = "bond-input", + .runs_before = VNET_FEATURES ("ethernet-input"), +}; +VLIB_NODE_FUNCTION_MULTIARCH (bond_input_node, bond_input_fn) +/* *INDENT-ON* */ + +static clib_error_t * +bond_sw_interface_up_down (vnet_main_t * vnm, u32 sw_if_index, u32 flags) +{ + bond_main_t *bm = &bond_main; + slave_if_t *sif; + vlib_main_t *vm = bm->vlib_main; + + sif = bond_get_slave_by_sw_if_index (sw_if_index); + if (sif) + { + sif->port_enabled = flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP; + if (sif->port_enabled == 0) + { + if (sif->lacp_enabled == 0) + { + bond_disable_collecting_distributing (vm, sif); + } + } + else + { + if (sif->lacp_enabled == 0) + { + bond_enable_collecting_distributing (vm, sif); + } + } + } + + return 0; +} + +VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION (bond_sw_interface_up_down); + +static clib_error_t * +bond_hw_interface_up_down (vnet_main_t * vnm, u32 
hw_if_index, u32 flags) +{ + bond_main_t *bm = &bond_main; + slave_if_t *sif; + vnet_sw_interface_t *sw; + vlib_main_t *vm = bm->vlib_main; + vnet_interface_main_t *im = &vnm->interface_main; + + sw = pool_elt_at_index (im->sw_interfaces, hw_if_index); + sif = bond_get_slave_by_sw_if_index (sw->sw_if_index); + if (sif) + { + if (!(flags & VNET_HW_INTERFACE_FLAG_LINK_UP)) + { + if (sif->lacp_enabled == 0) + { + bond_disable_collecting_distributing (vm, sif); + } + } + else + { + if (sif->lacp_enabled == 0) + { + bond_enable_collecting_distributing (vm, sif); + } + } + } + + return 0; +} + +VNET_HW_INTERFACE_LINK_UP_DOWN_FUNCTION (bond_hw_interface_up_down); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/bonding/node.h b/src/vnet/bonding/node.h new file mode 100644 index 00000000000..74f3b1a356a --- /dev/null +++ b/src/vnet/bonding/node.h @@ -0,0 +1,451 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __included_vnet_bonding_node_h__ +#define __included_vnet_bonding_node_h__ + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <vppinfra/format.h> +#include <vppinfra/hash.h> +#include <vnet/ethernet/ethernet.h> +#include <vnet/interface.h> + /* LACP periodic intervals and the timeouts derived from them; each timeout is three periods (3.0 and 90.0) */ +#define LACP_FAST_PERIODIC_TIMER 1.0 +#define LACP_SHORT_TIMOUT_TIME (LACP_FAST_PERIODIC_TIMER * 3) +#define LACP_SLOW_PERIODIC_TIMER 30.0 +#define LACP_LONG_TIMOUT_TIME (LACP_SLOW_PERIODIC_TIMER * 3) + /* Fallback MIN, defined only if none exists yet; note that it evaluates its arguments more than once */ +#ifndef MIN +#define MIN(x,y) (((x)<(y))?(x):(y)) +#endif + /* Bond modes, listed as _(value, BOND_MODE_ enum suffix, name string) */ +#define foreach_bond_mode \ + _ (1, ROUND_ROBIN, "round-robin") \ + _ (2, ACTIVE_BACKUP, "active-backup") \ + _ (3, XOR, "xor") \ + _ (4, BROADCAST, "broadcast") \ + _ (5, LACP, "lacp") + /* bond_mode_t: one BOND_MODE_<suffix> = value enumerator per foreach_bond_mode entry */ +typedef enum +{ +#define _(v, f, s) BOND_MODE_##f = v, + foreach_bond_mode +#undef _ +} bond_mode_t; + +/* configurable load-balances, listed as _(value, BOND_LB_ enum suffix, name string, p); the l34 entry yields the lower-case enumerator BOND_LB_l34 */ +#define foreach_bond_lb \ + _ (2, L23, "l23", l23) \ + _ (1, l34 , "l34", l34) \ + _ (0, L2, "l2", l2) + +/* load-balance functions implemented in bond-output; same four-argument layout, the fourth argument is not used by the expansions in this header */ +#define foreach_bond_lb_algo \ + _ (0, L2, "l2", l2) \ + _ (1, l34 , "l34", l34) \ + _ (2, L23, "l23", l23) \ + _ (3, RR, "round-robin", round_robin) \ + _ (4, BC, "broadcast", broadcast) \ + _ (5, AB, "active-backup", active_backup) + /* bond_load_balance_t: one BOND_LB_<suffix> = value enumerator per foreach_bond_lb_algo entry */ +typedef enum +{ +#define _(v, f, s, p) BOND_LB_##f = v, + foreach_bond_lb_algo +#undef _ +} bond_load_balance_t; + /* Argument block passed to bond_create_if (); the fields below the "return" comment carry the result */ +typedef struct +{ + u8 hw_addr_set; + u8 hw_addr[6]; + u8 mode; + u8 lb; + /* return */ + u32 sw_if_index; + int rv; + clib_error_t *error; +} bond_create_if_args_t; + /* Argument block passed to bond_enslave (); the fields below the "return" comment carry the result */ +typedef struct +{ + /* slave's sw_if_index */ + u32 slave; + /* bond's sw_if_index */ + u32 group; + u8 is_passive; + u8 is_long_timeout; + /* return */ + int rv; + clib_error_t *error; +} bond_enslave_args_t; + /* Argument block passed to bond_detach_slave (); the fields below the "return" comment carry the result */ +typedef struct +{ + u32 slave; + /* return */ + int rv; + clib_error_t *error; +} bond_detach_slave_args_t; + +/** BOND interface details struct */ +typedef struct +{ + u32 sw_if_index; + u8 interface_name[64]; + u8 mode; + u8 lb; + u32
active_slaves; + u32 slaves; +} bond_interface_details_t; + +/** slave interface details struct */ +typedef struct +{ + u32 sw_if_index; + u8 interface_name[64]; + u8 is_passive; + u8 is_long_timeout; + u32 active_slaves; +} slave_interface_details_t; + /* LACP port parameters, declared packed */ +typedef CLIB_PACKED (struct + { + u16 system_priority; + u8 system[6]; + u16 key; u16 port_priority; u16 port_number; + u8 state; + }) lacp_port_info_t; + /* Per-bond interface state; elements live in the bond_main_t interfaces pool */ +typedef struct +{ + u8 admin_up; + u8 mode; + u8 lb; + + /* the last slave index for the rr lb */ + u32 lb_rr_last_index; + + u32 dev_instance; + u32 hw_if_index; + u32 sw_if_index; + + /* Configured slaves */ + u32 *slaves; + + /* Slaves that are in DISTRIBUTING state */ + u32 *active_slaves; + + /* rapidly find an active slave */ + uword *active_slave_by_sw_if_index; + + lacp_port_info_t partner; + lacp_port_info_t actor; + u8 individual_aggregator; + + u32 group; + uword *port_number_bitmap; + u8 use_custom_mac; + u8 hw_address[6]; +} bond_if_t; + /* Per-slave state; elements live in the bond_main_t neighbors pool */ +typedef struct +{ + u8 persistent_hw_address[6]; + + /* neighbor's vlib software interface index */ + u32 sw_if_index; + + /* Neighbor time-to-live (usually 3s) */ + f32 ttl_in_seconds; + + /* 1 = interface is configured with long timeout (LACP_LONG_TIMOUT_TIME, 90s) */ + u8 is_long_timeout; + + /* 1 = debug is on; 0 = debug is off */ + u8 debug; + + /* tx packet template id for this neighbor */ + u8 packet_template_index; + + /* Info we actually keep about each neighbor */ + + /* Jenkins hash optimization: avoid tlv scan, send short keepalive msg */ + u8 last_packet_signature_valid; + uword last_packet_signature; + + /* last received lacp packet, for the J-hash optimization */ + u8 *last_rx_pkt; + + /* last marker packet */ + u8 *last_marker_pkt; + + /* neighbor vlib hw_if_index */ + u32 hw_if_index; + + /* actor does not initiate the protocol exchange */ + u8 is_passive; + + /* Partner port information */ + lacp_port_info_t partner; + lacp_port_info_t partner_admin; + + /* Actor port information */ + lacp_port_info_t actor; +
lacp_port_info_t actor_admin; + + /* Need To Transmit flag */ + u8 ntt; + + /* Link has been established and Aggregate Port is operable */ + u8 port_enabled; + + /* Initialization or reinitialization of the lacp protocol entity */ + u8 begin; + + /* Aggregation Port is operating the lacp */ + u8 lacp_enabled; + + /* MUX to indicate to the Selection Logic wait_while_timer expired */ + u8 ready_n; + + /* Selection Logic indicates all Aggregation Ports attached */ + u8 ready; + + /* Selection Logic selected an Aggregator */ + int selected; + + /* RX machine indicates an Aggregation Port in PORT_DISABLED state */ + u8 port_moved; + + /* timer used to detect whether received protocol information has expired */ + f64 current_while_timer; + + /* timer used to detect actor churn states */ + f64 actor_churn_timer; + + /* time last lacpdu was sent */ + f64 last_lacpdu_time; + + /* timer used to generate periodic transmission */ + f64 periodic_timer; + + /* timer used to detect partner churn states */ + f64 partner_churn_timer; + + /* provides hysteresis before performing an aggregation change */ + f64 wait_while_timer; + + /* Implementation variables, not in the spec */ + int rx_state; + int tx_state; + int mux_state; + int ptx_state; + + /* actor admin key */ + u32 group; + + u32 marker_tx_id; + /* presumably the dev_instance of the owning bond_if_t - confirm against the enslave code */ + u32 bif_dev_instance; + + u8 loopback_port; + + /* bond mode */ + u8 mode; + + clib_spinlock_t lockp; +} slave_if_t; + /* Callback type held in bond_main_t.lacp_enable_disable */ +typedef void (*lacp_enable_disable_func) (vlib_main_t * vm, bond_if_t * bif, + slave_if_t * sif, u8 enable); + /* Global bonding state */ +typedef struct +{ + /* pool of bonding interfaces */ + bond_if_t *interfaces; + + /* pool of lacp neighbors */ + slave_if_t *neighbors; + + /* rapidly find a neighbor by vlib software interface index */ + uword *neighbor_by_sw_if_index; + + /* rapidly find a bond by vlib software interface index */ + uword *bond_by_sw_if_index; + + /* convenience variables */ + vlib_main_t *vlib_main; + vnet_main_t *vnet_main; + + /* lacp plugin is loaded */ + u8
lacp_plugin_loaded; + + lacp_enable_disable_func lacp_enable_disable; +} bond_main_t; + +/* bond packet trace capture */ +typedef struct +{ + ethernet_header_t ethernet; + u32 sw_if_index; + u32 bond_sw_if_index; +} bond_packet_trace_t; + /* Load-balance callback type: takes vm, node, the bond interface and one buffer and returns a u32 (presumably identifying the chosen slave - confirm against bond-output) */ +typedef u32 (*load_balance_func) (vlib_main_t * vm, + vlib_node_runtime_t * node, bond_if_t * bif, + vlib_buffer_t * b0); + /* Wrapper holding a single load_balance_func pointer */ +typedef struct +{ + load_balance_func load_balance; +} bond_load_balance_func_t; + +extern vlib_node_registration_t bond_input_node; +extern vnet_device_class_t bond_dev_class; +extern bond_main_t bond_main; + /* Entry points implemented outside this header */ +void bond_disable_collecting_distributing (vlib_main_t * vm, + slave_if_t * sif); +void bond_enable_collecting_distributing (vlib_main_t * vm, slave_if_t * sif); +u8 *format_bond_interface_name (u8 * s, va_list * args); + +void bond_create_if (vlib_main_t * vm, bond_create_if_args_t * args); +int bond_delete_if (vlib_main_t * vm, u32 sw_if_index); +void bond_enslave (vlib_main_t * vm, bond_enslave_args_t * args); +void bond_detach_slave (vlib_main_t * vm, bond_detach_slave_args_t * args); +int bond_dump_ifs (bond_interface_details_t ** out_bondids); +int bond_dump_slave_ifs (slave_interface_details_t ** out_slaveids, + u32 bond_sw_if_index); + /* unformat helper: tries each foreach_bond_mode name string against input and stores the matching BOND_MODE_ value through the u8 pointer taken from args. Returns 1 on a match, 0 otherwise. */ +static inline uword +unformat_bond_mode (unformat_input_t * input, va_list * args) +{ + u8 *r = va_arg (*args, u8 *); + + if (0); /* anchors the else-if chain generated by the macro expansion below */ +#define _(v, f, s) else if (unformat (input, s)) *r = BOND_MODE_##f; + foreach_bond_mode +#undef _ + else + return 0; + + return 1; +} + /* format helper: appends to s the name string of the u32 mode taken from args, or "unknown" when the value is not a BOND_MODE_ enumerator. */ +static inline u8 * +format_bond_mode (u8 * s, va_list * args) +{ + u32 i = va_arg (*args, u32); + u8 *t = 0; + + switch (i) + { +#define _(v, f, s) case BOND_MODE_##f: t = (u8 *) s; break; + foreach_bond_mode +#undef _ + default: + return format (s, "unknown"); + } + return format (s, "%s", t); +} + /* unformat helper: tries each foreach_bond_lb name string (the configurable subset only) against input and stores the matching BOND_LB_ value through the u8 pointer taken from args. Returns 1 on a match, 0 otherwise. */ +static inline uword +unformat_bond_load_balance (unformat_input_t * input, va_list * args) +{ + u8 *r = va_arg (*args, u8 *); + + if (0); +#define _(v, f, s, p) else if (unformat (input, s)) *r =
BOND_LB_##f; + foreach_bond_lb +#undef _ + else + return 0; + + return 1; +} + /* format helper: appends to s the name string of the u32 load-balance value taken from args (any foreach_bond_lb_algo entry), or "unknown" when the value is not a BOND_LB_ enumerator. */ +static inline u8 * +format_bond_load_balance (u8 * s, va_list * args) +{ + u32 i = va_arg (*args, u32); + u8 *t = 0; + + switch (i) + { +#define _(v, f, s, p) case BOND_LB_##f: t = (u8 *) s; break; + foreach_bond_lb_algo +#undef _ + default: + return format (s, "unknown"); + } + return format (s, "%s", t); +} + /* Stores func in bond_main.lacp_enable_disable and sets bond_main.lacp_plugin_loaded to 1. */ +static inline void +bond_register_callback (lacp_enable_disable_func func) +{ + bond_main_t *bm = &bond_main; + + bm->lacp_plugin_loaded = 1; + bm->lacp_enable_disable = func; +} + /* Looks sw_if_index up in bond_by_sw_if_index and returns the matching element of the interfaces pool, or 0 when the hash has no entry. */ +static inline bond_if_t * +bond_get_master_by_sw_if_index (u32 sw_if_index) +{ + bond_main_t *bm = &bond_main; + uword *p; + + p = hash_get (bm->bond_by_sw_if_index, sw_if_index); + if (!p) + { + return 0; + } + return pool_elt_at_index (bm->interfaces, p[0]); +} + /* Returns the interfaces pool element at dev_instance; no lookup or existence check is done here, so the caller must pass a valid index. */ +static inline bond_if_t * +bond_get_master_by_dev_instance (u32 dev_instance) +{ + bond_main_t *bm = &bond_main; + + return pool_elt_at_index (bm->interfaces, dev_instance); +} + /* Looks sw_if_index up in neighbor_by_sw_if_index and returns the matching element of the neighbors pool, or 0 when the hash has no entry. */ +static inline slave_if_t * +bond_get_slave_by_sw_if_index (u32 sw_if_index) +{ + bond_main_t *bm = &bond_main; + slave_if_t *sif = 0; + uword *p; + + p = hash_get (bm->neighbor_by_sw_if_index, sw_if_index); + if (p) + { + sif = pool_elt_at_index (bm->neighbors, p[0]); + } + return sif; +} + +#endif /* __included_vnet_bonding_node_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ |