/* * snat.c - simple nat plugin * * Copyright (c) 2016 Cisco and/or its affiliates. * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at: * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include snat_main_t snat_main; fib_source_t nat_fib_src_hi; fib_source_t nat_fib_src_low; /* *INDENT-OFF* */ /* Hook up input features */ VNET_FEATURE_INIT (nat_pre_in2out, static) = { .arc_name = "ip4-unicast", .node_name = "nat-pre-in2out", .runs_after = VNET_FEATURES ("acl-plugin-in-ip4-fa", "ip4-sv-reassembly-feature"), }; VNET_FEATURE_INIT (nat_pre_out2in, static) = { .arc_name = "ip4-unicast", .node_name = "nat-pre-out2in", .runs_after = VNET_FEATURES ("acl-plugin-in-ip4-fa", "ip4-dhcp-client-detect", "ip4-sv-reassembly-feature"), }; VNET_FEATURE_INIT (snat_in2out_worker_handoff, static) = { .arc_name = "ip4-unicast", .node_name = "nat44-in2out-worker-handoff", .runs_after = VNET_FEATURES ("acl-plugin-in-ip4-fa"), }; VNET_FEATURE_INIT (snat_out2in_worker_handoff, static) = { .arc_name = "ip4-unicast", .node_name = "nat44-out2in-worker-handoff", .runs_after = VNET_FEATURES ("acl-plugin-in-ip4-fa", "ip4-dhcp-client-detect"), }; VNET_FEATURE_INIT (ip4_snat_in2out, static) = { .arc_name = "ip4-unicast", .node_name = "nat44-in2out", .runs_after = VNET_FEATURES ("acl-plugin-in-ip4-fa","ip4-sv-reassembly-feature"), }; VNET_FEATURE_INIT (ip4_snat_out2in, static) = { 
.arc_name = "ip4-unicast", .node_name = "nat44-out2in", .runs_after = VNET_FEATURES ("acl-plugin-in-ip4-fa","ip4-sv-reassembly-feature", "ip4-dhcp-client-detect"), }; VNET_FEATURE_INIT (ip4_nat_classify, static) = { .arc_name = "ip4-unicast", .node_name = "nat44-classify", .runs_after = VNET_FEATURES ("acl-plugin-in-ip4-fa","ip4-sv-reassembly-feature"), }; VNET_FEATURE_INIT (ip4_snat_det_in2out, static) = { .arc_name = "ip4-unicast", .node_name = "nat44-det-in2out", .runs_after = VNET_FEATURES ("acl-plugin-in-ip4-fa","ip4-sv-reassembly-feature"), }; VNET_FEATURE_INIT (ip4_snat_det_out2in, static) = { .arc_name = "ip4-unicast", .node_name = "nat44-det-out2in", .runs_after = VNET_FEATURES ("acl-plugin-in-ip4-fa","ip4-sv-reassembly-feature", "ip4-dhcp-client-detect"), }; VNET_FEATURE_INIT (ip4_nat_det_classify, static) = { .arc_name = "ip4-unicast", .node_name = "nat44-det-classify", .runs_after = VNET_FEATURES ("acl-plugin-in-ip4-fa","ip4-sv-reassembly-feature"), }; VNET_FEATURE_INIT (ip4_nat44_ed_in2out, static) = { .arc_name = "ip4-unicast", .node_name = "nat44-ed-in2out", .runs_after = VNET_FEATURES ("acl-plugin-in-ip4-fa","ip4-sv-reassembly-feature"), }; VNET_FEATURE_INIT (ip4_nat44_ed_out2in, static) = { .arc_name = "ip4-unicast", .node_name = "nat44-ed-out2in", .runs_after = VNET_FEATURES ("acl-plugin-in-ip4-fa","ip4-sv-reassembly-feature", "ip4-dhcp-client-detect"), }; VNET_FEATURE_INIT (ip4_nat44_ed_classify, static) = { .arc_name = "ip4-unicast", .node_name = "nat44-ed-classify", .runs_after = VNET_FEATURES ("acl-plugin-in-ip4-fa","ip4-sv-reassembly-feature"), }; VNET_FEATURE_INIT (ip4_nat_handoff_classify, static) = { .arc_name = "ip4-unicast", .node_name = "nat44-handoff-classify", .runs_after = VNET_FEATURES ("acl-plugin-in-ip4-fa","ip4-sv-reassembly-feature"), }; VNET_FEATURE_INIT (ip4_snat_in2out_fast, static) = { .arc_name = "ip4-unicast", .node_name = "nat44-in2out-fast", .runs_after = VNET_FEATURES 
("acl-plugin-in-ip4-fa","ip4-sv-reassembly-feature"), }; VNET_FEATURE_INIT (ip4_snat_out2in_fast, static) = { .arc_name = "ip4-unicast", .node_name = "nat44-out2in-fast", .runs_after = VNET_FEATURES ("acl-plugin-in-ip4-fa","ip4-sv-reassembly-feature", "ip4-dhcp-client-detect"), }; VNET_FEATURE_INIT (ip4_snat_hairpin_dst, static) = { .arc_name = "ip4-unicast", .node_name = "nat44-hairpin-dst", .runs_after = VNET_FEATURES ("acl-plugin-in-ip4-fa","ip4-sv-reassembly-feature"), }; VNET_FEATURE_INIT (ip4_nat44_ed_hairpin_dst, static) = { .arc_name = "ip4-unicast", .node_name = "nat44-ed-hairpin-dst", .runs_after = VNET_FEATURES ("acl-plugin-in-ip4-fa","ip4-sv-reassembly-feature"), }; /* Hook up output features */ VNET_FEATURE_INIT (ip4_snat_in2out_output, static) = { .arc_name = "ip4-output", .node_name = "nat44-in2out-output", .runs_after = VNET_FEATURES ("acl-plugin-out-ip4-fa","ip4-sv-reassembly-output-feature"), }; VNET_FEATURE_INIT (ip4_snat_in2out_output_worker_handoff, static) = { .arc_name = "ip4-output", .node_name = "nat44-in2out-output-worker-handoff", .runs_after = VNET_FEATURES ("acl-plugin-out-ip4-fa","ip4-sv-reassembly-output-feature"), }; VNET_FEATURE_INIT (ip4_snat_hairpin_src, static) = { .arc_name = "ip4-output", .node_name = "nat44-hairpin-src", .runs_after = VNET_FEATURES ("acl-plugin-out-ip4-fa","ip4-sv-reassembly-output-feature"), }; VNET_FEATURE_INIT (ip4_nat44_ed_in2out_output, static) = { .arc_name = "ip4-output", .node_name = "nat44-ed-in2out-output", .runs_after = VNET_FEATURES ("ip4-sv-reassembly-output-feature"), .runs_before = VNET_FEATURES ("acl-plugin-out-ip4-fa"), }; VNET_FEATURE_INIT (ip4_nat44_ed_hairpin_src, static) = { .arc_name = "ip4-output", .node_name = "nat44-ed-hairpin-src", .runs_after = VNET_FEATURES ("ip4-sv-reassembly-output-feature"), .runs_before = VNET_FEATURES ("acl-plugin-out-ip4-fa"), }; /* Hook up ip4-local features */ VNET_FEATURE_INIT (ip4_nat_hairpinning, static) = { .arc_name = "ip4-local", .node_name = 
"nat44-hairpinning", .runs_before = VNET_FEATURES("ip4-local-end-of-arc"), }; VNET_FEATURE_INIT (ip4_nat44_ed_hairpinning, static) = { .arc_name = "ip4-local", .node_name = "nat44-ed-hairpinning", .runs_before = VNET_FEATURES("ip4-local-end-of-arc"), }; VLIB_PLUGIN_REGISTER () = { .version = VPP_BUILD_VER, .description = "Network Address Translation (NAT)", }; /* *INDENT-ON* */ void nat_free_session_data (snat_main_t * sm, snat_session_t * s, u32 thread_index, u8 is_ha) { snat_session_key_t key; clib_bihash_kv_8_8_t kv; u8 proto; u16 r_port, l_port; ip4_address_t *l_addr, *r_addr; u32 fib_index = 0; clib_bihash_kv_16_8_t ed_kv; snat_main_per_thread_data_t *tsm = vec_elt_at_index (sm->per_thread_data, thread_index); if (is_fwd_bypass_session (s)) { if (snat_is_unk_proto_session (s)) { make_ed_kv (&s->in2out.addr, &s->ext_host_addr, s->in2out.port, 0, 0, 0, ~0, ~0, &ed_kv); } else { l_port = s->in2out.port; r_port = s->ext_host_port; l_addr = &s->in2out.addr; r_addr = &s->ext_host_addr; proto = nat_proto_to_ip_proto (s->in2out.protocol); make_ed_kv (l_addr, r_addr, proto, fib_index, l_port, r_port, ~0, ~0, &ed_kv); } if (clib_bihash_add_del_16_8 (&tsm->in2out_ed, &ed_kv, 0)) nat_elog_warn ("in2out_ed key del failed"); return; } /* session lookup tables */ if (is_ed_session (s)) { if (is_affinity_sessions (s)) nat_affinity_unlock (s->ext_host_addr, s->out2in.addr, s->in2out.protocol, s->out2in.port); l_addr = &s->out2in.addr; r_addr = &s->ext_host_addr; fib_index = s->out2in.fib_index; if (snat_is_unk_proto_session (s)) { proto = s->in2out.port; r_port = 0; l_port = 0; } else { proto = nat_proto_to_ip_proto (s->in2out.protocol); l_port = s->out2in.port; r_port = s->ext_host_port; } make_ed_kv (l_addr, r_addr, proto, fib_index, l_port, r_port, ~0, ~0, &ed_kv); if (clib_bihash_add_del_16_8 (&sm->out2in_ed, &ed_kv, 0)) nat_elog_warn ("out2in_ed key del failed"); l_addr = &s->in2out.addr; fib_index = s->in2out.fib_index; if (!snat_is_unk_proto_session (s)) l_port = 
s->in2out.port; if (is_twice_nat_session (s)) { r_addr = &s->ext_host_nat_addr; r_port = s->ext_host_nat_port; } make_ed_kv (l_addr, r_addr, proto, fib_index, l_port, r_port, ~0, ~0, &ed_kv); if (clib_bihash_add_del_16_8 (&tsm->in2out_ed, &ed_kv, 0)) nat_elog_warn ("in2out_ed key del failed"); if (!is_ha) nat_syslog_nat44_sdel (s->user_index, s->in2out.fib_index, &s->in2out.addr, s->in2out.port, &s->ext_host_nat_addr, s->ext_host_nat_port, &s->out2in.addr, s->out2in.port, &s->ext_host_addr, s->ext_host_port, s->in2out.protocol, is_twice_nat_session (s)); } else { kv.key = s->in2out.as_u64; if (clib_bihash_add_del_8_8 (&tsm->in2out, &kv, 0)) nat_elog_warn ("in2out key del failed"); kv.key = s->out2in.as_u64; if (clib_bihash_add_del_8_8 (&tsm->out2in, &kv, 0)) nat_elog_warn ("out2in key del failed"); if (!is_ha) nat_syslog_nat44_apmdel (s->user_index, s->in2out.fib_index, &s->in2out.addr, s->in2out.port, &s->out2in.addr, s->out2in.port, s->in2out.protocol); } if (snat_is_unk_proto_session (s)) return; if (!is_ha) { /* log NAT event */ snat_ipfix_logging_nat44_ses_delete (thread_index, s->in2out.addr.as_u32, s->out2in.addr.as_u32, s->in2out.protocol, s->in2out.port, s->out2in.port, s->in2out.fib_index); nat_ha_sdel (&s->out2in.addr, s->out2in.port, &s->ext_host_addr, s->ext_host_port, s->out2in.protocol, s->out2in.fib_index, thread_index); } /* Twice NAT address and port for external host */ if (is_twice_nat_session (s)) { key.protocol = s->in2out.protocol; key.port = s->ext_host_nat_port; key.addr.as_u32 = s->ext_host_nat_addr.as_u32; snat_free_outside_address_and_port (sm->twice_nat_addresses, thread_index, &key); } if (snat_is_session_static (s)) return; snat_free_outside_address_and_port (sm->addresses, thread_index, &s->out2in); } int nat44_set_session_limit (u32 session_limit, u32 vrf_id) { snat_main_t *sm = &snat_main; u32 fib_index = fib_table_find (FIB_PROTOCOL_IP4, vrf_id); u32 len = vec_len (sm->max_translations_per_fib); if (len <= fib_index) { 
vec_validate (sm->max_translations_per_fib, fib_index + 1); for (; len < vec_len (sm->max_translations_per_fib); len++) sm->max_translations_per_fib[len] = sm->max_translations; } sm->max_translations_per_fib[fib_index] = session_limit; return 0; } void nat44_free_session_data (snat_main_t * sm, snat_session_t * s, u32 thread_index, u8 is_ha) { snat_session_key_t key; u8 proto; u16 r_port, l_port; ip4_address_t *l_addr, *r_addr; u32 fib_index; clib_bihash_kv_16_8_t ed_kv; snat_main_per_thread_data_t *tsm = vec_elt_at_index (sm->per_thread_data, thread_index); if (is_fwd_bypass_session (s)) { if (snat_is_unk_proto_session (s)) { proto = s->in2out.port; r_port = 0; l_port = 0; } else { proto = nat_proto_to_ip_proto (s->in2out.protocol); l_port = s->in2out.port; r_port = s->ext_host_port; } l_addr = &s->in2out.addr; r_addr = &s->ext_host_addr; fib_index = 0; make_ed_kv (l_addr, r_addr, proto, fib_index, l_port, r_port, ~0, ~0, &ed_kv); if (PREDICT_FALSE (clib_bihash_add_del_16_8 (&tsm->in2out_ed, &ed_kv, 0))) nat_elog_warn ("in2out_ed key del failed"); return; } /* session lookup tables */ if (is_affinity_sessions (s)) nat_affinity_unlock (s->ext_host_addr, s->out2in.addr, s->in2out.protocol, s->out2in.port); l_addr = &s->out2in.addr; r_addr = &s->ext_host_addr; fib_index = s->out2in.fib_index; if (snat_is_unk_proto_session (s)) { proto = s->in2out.port; r_port = 0; l_port = 0; } else { proto = nat_proto_to_ip_proto (s->in2out.protocol); l_port = s->out2in.port; r_port = s->ext_host_port; } make_ed_kv (l_addr, r_addr, proto, fib_index, l_port, r_port, ~0, ~0, &ed_kv); if (PREDICT_FALSE (clib_bihash_add_del_16_8 (&sm->out2in_ed, &ed_kv, 0))) nat_elog_warn ("out2in_ed key del failed"); l_addr = &s->in2out.addr; fib_index = s->in2out.fib_index; if (!snat_is_unk_proto_session (s)) l_port = s->in2out.port; if (is_twice_nat_session (s)) { r_addr = &s->ext_host_nat_addr; r_port = s->ext_host_nat_port; } make_ed_kv (l_addr, r_addr, proto, fib_index, l_port, r_port, ~0, ~0, 
&ed_kv); if (PREDICT_FALSE (clib_bihash_add_del_16_8 (&tsm->in2out_ed, &ed_kv, 0))) nat_elog_warn ("in2out_ed key del failed"); if (!is_ha) { nat_syslog_nat44_sdel (s->user_index, s->in2out.fib_index, &s->in2out.addr, s->in2out.port, &s->ext_host_nat_addr, s->ext_host_nat_port, &s->out2in.addr, s->out2in.port, &s->ext_host_addr, s->ext_host_port, s->in2out.protocol, is_twice_nat_session (s)); } if (snat_is_unk_proto_session (s)) return; if (!is_ha) { snat_ipfix_logging_nat44_ses_delete (thread_index, s->in2out.addr.as_u32, s->out2in.addr.as_u32, s->in2out.protocol, s->in2out.port, s->out2in.port, s->in2out.fib_index); nat_ha_sdel (&s->out2in.addr, s->out2in.port, &s->ext_host_addr, s->ext_host_port, s->out2in.protocol, s->out2in.fib_index, thread_index); } /* Twice NAT address and port for external host */ if (is_twice_nat_session (s)) { key.protocol = s->in2out.protocol; key.port = s->ext_host_nat_port; key.addr.as_u32 = s->ext_host_nat_addr.as_u32; snat_free_outside_address_and_port (sm->twice_nat_addresses, thread_index, &key); } if (snat_is_session_static (s)) return; snat_free_outside_address_and_port (sm->addresses, thread_index, &s->out2in); } snat_user_t * nat_user_get_or_create (snat_main_t * sm, ip4_address_t * addr, u32 fib_index, u32 thread_index) { snat_user_t *u = 0; snat_user_key_t user_key; clib_bihash_kv_8_8_t kv, value; snat_main_per_thread_data_t *tsm = &sm->per_thread_data[thread_index]; dlist_elt_t *per_user_list_head_elt; user_key.addr.as_u32 = addr->as_u32; user_key.fib_index = fib_index; kv.key = user_key.as_u64; /* Ever heard of the "user" = src ip4 address before? 
*/ if (clib_bihash_search_8_8 (&tsm->user_hash, &kv, &value)) { /* no, make a new one */ pool_get (tsm->users, u); clib_memset (u, 0, sizeof (*u)); u->addr.as_u32 = addr->as_u32; u->fib_index = fib_index; pool_get (tsm->list_pool, per_user_list_head_elt); u->sessions_per_user_list_head_index = per_user_list_head_elt - tsm->list_pool; clib_dlist_init (tsm->list_pool, u->sessions_per_user_list_head_index); kv.value = u - tsm->users; /* add user */ if (clib_bihash_add_del_8_8 (&tsm->user_hash, &kv, 1)) { nat_elog_warn ("user_hash key add failed"); nat44_delete_user_with_no_session (sm, u, thread_index); return NULL; } vlib_set_simple_counter (&sm->total_users, thread_index, 0, pool_elts (tsm->users)); } else { u = pool_elt_at_index (tsm->users, value.value); } return u; } snat_session_t * nat_session_alloc_or_recycle (snat_main_t * sm, snat_user_t * u, u32 thread_index, f64 now) { snat_session_t *s; snat_main_per_thread_data_t *tsm = &sm->per_thread_data[thread_index]; u32 oldest_per_user_translation_list_index, session_index; dlist_elt_t *oldest_per_user_translation_list_elt; dlist_elt_t *per_user_translation_list_elt; /* Over quota? 
Recycle the least recently used translation */ if ((u->nsessions + u->nstaticsessions) >= sm->max_translations_per_user) { oldest_per_user_translation_list_index = clib_dlist_remove_head (tsm->list_pool, u->sessions_per_user_list_head_index); ASSERT (oldest_per_user_translation_list_index != ~0); /* Add it back to the end of the LRU list */ clib_dlist_addtail (tsm->list_pool, u->sessions_per_user_list_head_index, oldest_per_user_translation_list_index); /* Get the list element */ oldest_per_user_translation_list_elt = pool_elt_at_index (tsm->list_pool, oldest_per_user_translation_list_index); /* Get the session index from the list element */ session_index = oldest_per_user_translation_list_elt->value; /* Get the session */ s = pool_elt_at_index (tsm->sessions, session_index); nat_free_session_data (sm, s, thread_index, 0); if (snat_is_session_static (s)) u->nstaticsessions--; else u->nsessions--; s->flags = 0; s->total_bytes = 0; s->total_pkts = 0; s->state = 0; s->ext_host_addr.as_u32 = 0; s->ext_host_port = 0; s->ext_host_nat_addr.as_u32 = 0; s->ext_host_nat_port = 0; } else { pool_get (tsm->sessions, s); clib_memset (s, 0, sizeof (*s)); /* Create list elts */ pool_get (tsm->list_pool, per_user_translation_list_elt); clib_dlist_init (tsm->list_pool, per_user_translation_list_elt - tsm->list_pool); per_user_translation_list_elt->value = s - tsm->sessions; s->per_user_index = per_user_translation_list_elt - tsm->list_pool; s->per_user_list_head_index = u->sessions_per_user_list_head_index; clib_dlist_addtail (tsm->list_pool, s->per_user_list_head_index, per_user_translation_list_elt - tsm->list_pool); s->user_index = u - tsm->users; vlib_set_simple_counter (&sm->total_sessions, thread_index, 0, pool_elts (tsm->sessions)); } s->ha_last_refreshed = now; return s; } void snat_add_del_addr_to_fib (ip4_address_t * addr, u8 p_len, u32 sw_if_index, int is_add) { fib_prefix_t prefix = { .fp_len = p_len, .fp_proto = FIB_PROTOCOL_IP4, .fp_addr = { .ip4.as_u32 = addr->as_u32, 
}, }; u32 fib_index = ip4_fib_table_get_index_for_sw_if_index (sw_if_index); if (is_add) fib_table_entry_update_one_path (fib_index, &prefix, nat_fib_src_low, (FIB_ENTRY_FLAG_CONNECTED | FIB_ENTRY_FLAG_LOCAL | FIB_ENTRY_FLAG_EXCLUSIVE), DPO_PROTO_IP4, NULL, sw_if_index, ~0, 1, NULL, FIB_ROUTE_PATH_FLAG_NONE); else fib_table_entry_delete (fib_index, &prefix, nat_fib_src_low); } int snat_add_address (snat_main_t * sm, ip4_address_t * addr, u32 vrf_id, u8 twice_nat) { snat_address_t *ap; snat_interface_t *i; vlib_thread_main_t *tm = vlib_get_thread_main (); if (twice_nat && !sm->endpoint_dependent) return VNET_API_ERROR_FEATURE_DISABLED; /* Check if address already exists */ /* *INDENT-OFF* */ vec_foreach (ap, twice_nat ? sm->twice_nat_addresses : sm->addresses) { if (ap->addr.as_u32 == addr->as_u32) return VNET_API_ERROR_VALUE_EXIST; } /* *INDENT-ON* */ if (twice_nat) vec_add2 (sm->twice_nat_addresses, ap, 1); else vec_add2 (sm->addresses, ap, 1); ap->addr = *addr; if (vrf_id != ~0) ap->fib_index = fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, vrf_id, nat_fib_src_low); else ap->fib_index = ~0; #define _(N, i, n, s) \ clib_memset(ap->busy_##n##_port_refcounts, 0, sizeof(ap->busy_##n##_port_refcounts));\ ap->busy_##n##_ports = 0; \ ap->busy_##n##_ports_per_thread = 0;\ vec_validate_init_empty (ap->busy_##n##_ports_per_thread, tm->n_vlib_mains - 1, 0); foreach_nat_protocol #undef _ if (twice_nat) return 0; /* Add external address to FIB */ /* *INDENT-OFF* */ pool_foreach (i, sm->interfaces, ({ if (nat_interface_is_inside(i) || sm->out2in_dpo) continue; snat_add_del_addr_to_fib(addr, 32, i->sw_if_index, 1); break; })); pool_foreach (i, sm->output_feature_interfaces, ({ if (nat_interface_is_inside(i) || sm->out2in_dpo) continue; snat_add_del_addr_to_fib(addr, 32, i->sw_if_index, 1); break; })); /* *INDENT-ON* */ return 0; } static int is_snat_address_used_in_static_mapping (snat_main_t * sm, ip4_address_t addr) { snat_static_mapping_t *m; /* *INDENT-OFF* */ 
pool_foreach (m, sm->static_mappings, ({ if (is_addr_only_static_mapping (m) || is_out2in_only_static_mapping (m) || is_identity_static_mapping (m)) continue; if (m->external_addr.as_u32 == addr.as_u32) return 1; })); /* *INDENT-ON* */ return 0; } static void snat_add_static_mapping_when_resolved (snat_main_t * sm, ip4_address_t l_addr, u16 l_port, u32 sw_if_index, u16 e_port, u32 vrf_id, nat_protocol_t proto, int addr_only, int is_add, u8 * tag, int twice_nat, int out2in_only, int identity_nat) { snat_static_map_resolve_t *rp; vec_add2 (sm->to_resolve, rp, 1); rp->l_addr.as_u32 = l_addr.as_u32; rp->l_port = l_port; rp->sw_if_index = sw_if_index; rp->e_port = e_port; rp->vrf_id = vrf_id; rp->proto = proto; rp->addr_only = addr_only; rp->is_add = is_add; rp->twice_nat = twice_nat; rp->out2in_only = out2in_only; rp->identity_nat = identity_nat; rp->tag = vec_dup (tag); } static u32 get_thread_idx_by_port (u16 e_port) { snat_main_t *sm = &snat_main; u32 thread_idx = sm->num_workers; if (sm->num_workers > 1) { thread_idx = sm->first_worker_index + sm->workers[(e_port - 1024) / sm->port_per_thread]; } return thread_idx; } void snat_static_mapping_del_sessions (snat_main_t * sm, snat_main_per_thread_data_t * tsm, snat_user_key_t u_key, int addr_only, ip4_address_t e_addr, u16 e_port) { clib_bihash_kv_8_8_t kv, value; kv.key = u_key.as_u64; u64 user_index; dlist_elt_t *head, *elt; snat_user_t *u; snat_session_t *s; u32 elt_index, head_index, ses_index; if (!clib_bihash_search_8_8 (&tsm->user_hash, &kv, &value)) { user_index = value.value; u = pool_elt_at_index (tsm->users, user_index); if (u->nstaticsessions) { head_index = u->sessions_per_user_list_head_index; head = pool_elt_at_index (tsm->list_pool, head_index); elt_index = head->next; elt = pool_elt_at_index (tsm->list_pool, elt_index); ses_index = elt->value; while (ses_index != ~0) { s = pool_elt_at_index (tsm->sessions, ses_index); elt = pool_elt_at_index (tsm->list_pool, elt->next); ses_index = elt->value; if 
(!addr_only) { if ((s->out2in.addr.as_u32 != e_addr.as_u32) || (clib_net_to_host_u16 (s->out2in.port) != e_port)) continue; } if (is_lb_session (s)) continue; if (!snat_is_session_static (s)) continue; nat_free_session_data (sm, s, tsm - sm->per_thread_data, 0); nat44_delete_session (sm, s, tsm - sm->per_thread_data); if (!addr_only) break; } } } } void snat_ed_static_mapping_del_sessions (snat_main_t * sm, snat_main_per_thread_data_t * tsm, ip4_address_t l_addr, u16 l_port, u8 protocol, u32 fib_index, int addr_only, ip4_address_t e_addr, u16 e_port) { snat_session_t *s; u32 *indexes_to_free = NULL; /* *INDENT-OFF* */ pool_foreach (s, tsm->sessions, { if (s->in2out.fib_index != fib_index || s->in2out.addr.as_u32 != l_addr.as_u32) { continue; } if (!addr_only) { if ((s->out2in.addr.as_u32 != e_addr.as_u32) || (clib_net_to_host_u16 (s->out2in.port) != e_port) || clib_net_to_host_u16 (s->in2out.port) != l_port || s->in2out.protocol != protocol) continue; } if (is_lb_session (s)) continue; if (!snat_is_session_static (s)) continue; nat_free_session_data (sm, s, tsm - sm->per_thread_data, 0); vec_add1 (indexes_to_free, s - tsm->sessions); if (!addr_only) break; }); /* *INDENT-ON* */ u32 *ses_index; vec_foreach (ses_index, indexes_to_free) { s = pool_elt_at_index (tsm->sessions, *ses_index); nat_ed_session_delete (sm, s, tsm - sm->per_thread_data, 1); } vec_free (indexes_to_free); } int snat_add_static_mapping (ip4_address_t l_addr, ip4_address_t e_addr, u16 l_port, u16 e_port, u32 vrf_id, int addr_only, u32 sw_if_index, nat_protocol_t proto, int is_add, twice_nat_type_t twice_nat, u8 out2in_only, u8 * tag, u8 identity_nat) { snat_main_t *sm = &snat_main; snat_static_mapping_t *m; snat_session_key_t m_key; clib_bihash_kv_8_8_t kv, value; snat_address_t *a = 0; u32 fib_index = ~0; snat_interface_t *interface; int i; snat_main_per_thread_data_t *tsm; snat_user_key_t u_key; snat_user_t *u; dlist_elt_t *head, *elt; u32 elt_index, head_index; u32 ses_index; u64 user_index; 
snat_session_t *s; snat_static_map_resolve_t *rp, *rp_match = 0; nat44_lb_addr_port_t *local; u32 find = ~0; if (!sm->endpoint_dependent) { if (twice_nat || out2in_only) return VNET_API_ERROR_FEATURE_DISABLED; } /* If the external address is a specific interface address */ if (sw_if_index != ~0) { ip4_address_t *first_int_addr; for (i = 0; i < vec_len (sm->to_resolve); i++) { rp = sm->to_resolve + i; if (rp->sw_if_index != sw_if_index || rp->l_addr.as_u32 != l_addr.as_u32 || rp->vrf_id != vrf_id || rp->addr_only != addr_only) continue; if (!addr_only) { if ((rp->l_port != l_port && rp->e_port != e_port) || rp->proto != proto) continue; } rp_match = rp; break; } /* Might be already set... */ first_int_addr = ip4_interface_first_address (sm->ip4_main, sw_if_index, 0 /* just want the address */ ); if (is_add) { if (rp_match) return VNET_API_ERROR_VALUE_EXIST; snat_add_static_mapping_when_resolved (sm, l_addr, l_port, sw_if_index, e_port, vrf_id, proto, addr_only, is_add, tag, twice_nat, out2in_only, identity_nat); /* DHCP resolution required? */ if (first_int_addr == 0) { return 0; } else { e_addr.as_u32 = first_int_addr->as_u32; /* Identity mapping? */ if (l_addr.as_u32 == 0) l_addr.as_u32 = e_addr.as_u32; } } else { if (!rp_match) return VNET_API_ERROR_NO_SUCH_ENTRY; vec_del1 (sm->to_resolve, i); if (first_int_addr) { e_addr.as_u32 = first_int_addr->as_u32; /* Identity mapping? */ if (l_addr.as_u32 == 0) l_addr.as_u32 = e_addr.as_u32; } else return 0; } } m_key.addr = e_addr; m_key.port = addr_only ? 0 : e_port; m_key.protocol = addr_only ? 
0 : proto; m_key.fib_index = 0; kv.key = m_key.as_u64; if (clib_bihash_search_8_8 (&sm->static_mapping_by_external, &kv, &value)) m = 0; else m = pool_elt_at_index (sm->static_mappings, value.value); if (is_add) { if (m) { if (is_identity_static_mapping (m)) { /* *INDENT-OFF* */ pool_foreach (local, m->locals, ({ if (local->vrf_id == vrf_id) return VNET_API_ERROR_VALUE_EXIST; })); /* *INDENT-ON* */ pool_get (m->locals, local); local->vrf_id = vrf_id; local->fib_index = fib_table_find_or_create_and_lock (FIB_PROTOCOL_IP4, vrf_id, nat_fib_src_low); m_key.addr = m->local_addr; m_key.port = m->local_port; m_key.protocol = m->proto; m_key.fib_index = local->fib_index; kv.key = m_key.as_u64; kv.value = m - sm->static_mappings; clib_bihash_add_del_8_8 (&sm->static_mapping_by_local, &kv, 1); return 0; } else return VNET_API_ERROR_VALUE_EXIST; } if (twice_nat && addr_only) return VNET_API_ERROR_UNSUPPORTED; /* Convert VRF id to FIB index */ if (vrf_id != ~0) fib_i
/*
 * mpls_lookup.c: MPLS lookup
 *
 * Copyright (c) 2012-2014 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <vlib/vlib.h>
#include <vnet/pg/pg.h>
#include <vnet/mpls/mpls_lookup.h>
#include <vnet/fib/mpls_fib.h>
#include <vnet/dpo/load_balance_map.h>
#include <vnet/dpo/replicate_dpo.h>

/**
 * Static MPLS VLIB forwarding node.
 *
 * File-scope declaration of the node registration object for the MPLS
 * lookup node.
 * NOTE(review): only this declaration is visible in this part of the file;
 * presumably the initialised registration follows the node function --
 * confirm.
 */
static vlib_node_registration_t mpls_lookup_node;

/**
 * The arc/edge from the MPLS lookup node to the MPLS replicate node.
 *
 * mpls_lookup() uses this value as the next-node index for any packet whose
 * lookup result has the MPLS_IS_REPLICATE bit set.
 */
u32 mpls_lookup_to_replicate_edge;

/**
 * Per-packet trace record rendered by format_mpls_lookup_trace().
 * NOTE(review): the code that fills these fields in is not visible in this
 * part of the file; the descriptions below follow how the formatter prints
 * each field.
 */
typedef struct {
  /* Printed as "next [%d]". */
  u32 next_index;
  /* Printed as "LB index". */
  u32 lb_index;
  /* Printed as "lookup fib index". */
  u32 lfib_index;
  /* Label word kept in network byte order; the formatter converts it to
   * host order before extracting the label value and the S (eos) bit. */
  u32 label_net_byte_order;
  /* Printed in hex as "hash". */
  u32 hash;
} mpls_lookup_trace_t;

/**
 * Render one mpls_lookup_trace_t record as text.
 *
 * @param s    string being built; the rendered trace is appended to it
 * @param args variadic argument list carrying, in order, a vlib_main_t *,
 *             a vlib_node_t * and the mpls_lookup_trace_t * to print
 * @return the string returned by format() after appending the trace text
 */
static u8 *
format_mpls_lookup_trace (u8 * s, va_list * args)
{
  /* The first two arguments are not used here, but they must still be
   * pulled off the list so the trace record pointer comes next. */
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  mpls_lookup_trace_t * trace = va_arg (*args, mpls_lookup_trace_t *);
  u32 label_host_order;

  /* The record keeps the label word in network byte order; convert it once
   * and take both the label value and the S (eos) bit from the result. */
  label_host_order = clib_net_to_host_u32 (trace->label_net_byte_order);

  return format (s,
                 "MPLS: next [%d], lookup fib index %d, LB index %d hash %x "
                 "label %d eos %d",
                 trace->next_index,
                 trace->lfib_index,
                 trace->lb_index,
                 trace->hash,
                 vnet_mpls_uc_get_label (label_host_order),
                 vnet_mpls_uc_get_s (label_host_order));
}

static inline uword
mpls_lookup (vlib_main_t * vm,
             vlib_node_runtime_t * node,
             vlib_frame_t * from_frame)
{
  vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters;
  u32 n_left_from, next_index, * from, * to_next;
  mpls_main_t * mm = &mpls_main;
  u32 thread_index = vlib_get_thread_index();

  from = vlib_frame_vector_args (from_frame);
  n_left_from = from_frame->n_vectors;
  next_index = node->cached_next_index;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index,
                           to_next, n_left_to_next);

      while (n_left_from >= 8 && n_left_to_next >= 4)
        {
          u32 lbi0, next0, lfib_index0, bi0, hash_c0;
          const mpls_unicast_header_t * h0;
          const load_balance_t *lb0;
          const dpo_id_t *dpo0;
          vlib_buffer_t * b0;
          u32 lbi1, next1, lfib_index1, bi1, hash_c1;
          const mpls_unicast_header_t * h1;
          const load_balance_t *lb1;
          const dpo_id_t *dpo1;
          vlib_buffer_t * b1;
          u32 lbi2, next2, lfib_index2, bi2, hash_c2;
          const mpls_unicast_header_t * h2;
          const load_balance_t *lb2;
          const dpo_id_t *dpo2;
          vlib_buffer_t * b2;
          u32 lbi3, next3, lfib_index3, bi3, hash_c3;
          const mpls_unicast_header_t * h3;
          const load_balance_t *lb3;
          const dpo_id_t *dpo3;
          vlib_buffer_t * b3;

           /* Prefetch next iteration. */
          {
              vlib_buffer_t *p4, *p5, *p6, *p7;

            p4 = vlib_get_buffer (vm, from[4]);
            p5 = vlib_get_buffer (vm, from[5]);
            p6 = vlib_get_buffer (vm, from[6]);
            p7 = vlib_get_buffer (vm, from[7]);

            vlib_prefetch_buffer_header (p4, STORE);
            vlib_prefetch_buffer_header (p5, STORE);
            vlib_prefetch_buffer_header (p6, STORE);
            vlib_prefetch_buffer_header (p7, STORE);

            CLIB_PREFETCH (p4->data, sizeof (h0[0]), LOAD);
            CLIB_PREFETCH (p5->data, sizeof (h0[0]), LOAD);
            CLIB_PREFETCH (p6->data, sizeof (h0[0]), LOAD);
            CLIB_PREFETCH (p7->data, sizeof (h0[0]), LOAD);
          }

          bi0 = to_next[0] = from[0];
          bi1 = to_next[1] = from[1];
          bi2 = to_next[2] = from[2];
          bi3 = to_next[3] = from[3];

          from += 4;
          n_left_from -= 4;
          to_next += 4;
          n_left_to_next -= 4;

          b0 = vlib_get_buffer (vm, bi0);
          b1 = vlib_get_buffer (vm, bi1);
          b2 = vlib_get_buffer (vm, bi2);
          b3 = vlib_get_buffer (vm, bi3);
          h0 = vlib_buffer_get_current (b0);
          h1 = vlib_buffer_get_current (b1);
          h2 = vlib_buffer_get_current (b2);
          h3 = vlib_buffer_get_current (b3);

          lfib_index0 = vec_elt(mm->fib_index_by_sw_if_index,
                                vnet_buffer(b0)->sw_if_index[VLIB_RX]);
          lfib_index1 = vec_elt(mm->fib_index_by_sw_if_index,
                                vnet_buffer(b1)->sw_if_index[VLIB_RX]);
          lfib_index2 = vec_elt(mm->fib_index_by_sw_if_index,
                                vnet_buffer(b2)->sw_if_index[VLIB_RX]);
          lfib_index3 = vec_elt(mm->fib_index_by_sw_if_index,
                                vnet_buffer(b3)->sw_if_index[VLIB_RX]);

          lbi0 = mpls_fib_table_forwarding_lookup (lfib_index0, h0);
          lbi1 = mpls_fib_table_forwarding_lookup (lfib_index1, h1);
          lbi2 = mpls_fib_table_forwarding_lookup (lfib_index2, h2);
          lbi3 = mpls_fib_table_forwarding_lookup (lfib_index3, h3);

          hash_c0 = vnet_buffer(b0)->ip.flow_hash = 0;
          hash_c1 = vnet_buffer(b1)->ip.flow_hash = 0;
          hash_c2 = vnet_buffer(b2)->ip.flow_hash = 0;
          hash_c3 = vnet_buffer(b3)->ip.flow_hash = 0;

          if (MPLS_IS_REPLICATE & lbi0)
          {
              next0 = mpls_lookup_to_replicate_edge;
              vnet_buffer (b0)->ip.adj_index[VLIB_TX] =
                  (lbi0 & ~MPLS_IS_REPLICATE);
          }
          else
          {
              lb0 = load_balance_get(lbi0);
              ASSERT (lb0->lb_n_buckets > 0);
              ASSERT (is_pow2 (lb0->lb_n_buckets));

              if (PREDICT_FALSE(lb0->lb_n_buckets > 1))
              {
                  hash_c0 = vnet_buffer (b0)->ip.flow_hash =
                      mpls_compute_flow_hash(h0, lb0->lb_hash_config);
                  dpo0 = load_balance_get_fwd_bucket
                      (lb0,
                       (hash_c0 & (lb0->lb_n_buckets_minus_1)));
              }
              else
              {
                  dpo0 = load_balance_get_bucket_i (lb0, 0);
              }
              next0 = dpo0->dpoi_next_node;

              vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

              vlib_increment_combined_counter
                  (cm, thread_index, lbi0, 1,
                   vlib_buffer_length_in_chain (vm, b0));
          }
          if (MPLS_IS_REPLICATE & lbi1)
          {
              next1 = mpls_lookup_to_replicate_edge;
              vnet_buffer (b1)->ip.adj_index[VLIB_TX] =
                  (lbi1 & ~MPLS_IS_REPLICATE);
          }
          else
          {
              lb1 = load_balance_get(lbi1);
              ASSERT (lb1->lb_n_buckets > 0);
              ASSERT (is_pow2 (lb1->lb_n_buckets));

              if (PREDICT_FALSE(lb1->lb_n_buckets > 1))
              {
                  hash_c1 = vnet_buffer (b1)->ip.flow_hash =
                      mpls_compute_flow_hash(h1, lb1->lb_hash_config);
                  dpo1 = load_balance_get_fwd_bucket
                      (lb1,
                       (hash_c1 & (lb1->lb_n_buckets_minus_1)));
              }
              else
              {
                  dpo1 = load_balance_get_bucket_i (lb1, 0);
              }
              next1 = dpo1->dpoi_next_node;

              vnet_buffer (b1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;

              vlib_increment_combined_counter
                  (cm, thread_index, lbi1, 1,
                   vlib_buffer_length_in_chain (vm, b1));
          }
          if (MPLS_IS_REPLICATE & lbi2)
          {
              next2 = mpls_lookup_to_replicate_edge;
              vnet_buffer (b2)->ip.adj_index[VLIB_TX] =
                  (lbi2 & ~MPLS_IS_REPLICATE);
          }
          else
          {
              lb2 = load_balance_get(lbi2);
              ASSERT (lb2->lb_n_buckets > 0);
              ASSERT (is_pow2 (lb2->lb_n_buckets));

              if (PREDICT_FALSE(lb2->lb_n_buckets > 1))
              {
                  hash_c2 = vnet_buffer (b2)->ip.flow_hash =
                      mpls_compute_flow_hash(h2, lb2->lb_hash_config);
                  dpo2 = load_balance_get_fwd_bucket
                      (lb2,
                       (hash_c2 & (lb2->lb_n_buckets_minus_1)));
              }
              else
              {
                  dpo2 = load_balance_get_bucket_i (lb2, 0);
              }
              next2 = dpo2->dpoi_next_node;

              vnet_buffer (b2)->ip.adj_index[VLIB_TX] = dpo2->dpoi_index;

              vlib_increment_combined_counter
                  (cm, thread_index, lbi2, 1,
                   vlib_buffer_length_in_chain (vm, b2));
          }
          if (MPLS_IS_REPLICATE & lbi3)
          {
              next3 = mpls_lookup_to_replicate_edge;
              vnet_buffer (b3)->ip.adj_index[VLIB_TX] =
                  (lbi3 & ~MPLS_IS_REPLICATE);
          }
          else
          {
              lb3 = load_balance_get(lbi3);
              ASSERT (lb3->lb_n_buckets > 0);
              ASSERT (is_pow2 (lb3->lb_n_buckets));

              if (PREDICT_FALSE(lb3->lb_n_buckets > 1))
              {
                  hash_c3 = vnet_buffer (b3)->ip.flow_hash =
                      mpls_compute_flow_hash(h3, lb3->lb_hash_config);
                  dpo3 = load_balance_get_fwd_bucket
                      (lb3,
                       (hash_c3 & (lb3->lb_n_buckets_minus_1)));
              }
              else
              {
                  dpo3 = load_balance_get_bucket_i (lb3, 0);
              }
              next3 = dpo3->dpoi_next_node;

              vnet_buffer (b3)->ip.adj_index[VLIB_TX] = dpo3->dpoi_index;

              vlib_increment_combined_counter
                  (cm, thread_index, lbi3, 1,
                   vlib_buffer_length_in_chain (vm, b3));
          }

          /*
           * before we pop the label, copy the values we need to maintain.
           * The label header is in network byte order.
           *  last byte is the TTL.
           *  bits 2 to 4 inclusive are the EXP bits
           */
          vnet_buffer (b0)->mpls.ttl = ((char*)h0)[3];
          vnet_buffer (b0)->mpls.exp = (((char*)h0)[2] & 0xe) >> 1;
          vnet_buffer (b0)->mpls.first = 1;
          vnet_buffer (b1)->mpls.ttl = ((char*)h1)[3];
          vnet_buffer (b1)->mpls.exp = (((char*)h1)[2] & 0xe) >> 1;
          vnet_buffer (b1)->mpls.first = 1;
          vnet_buffer (b2)->mpls.ttl = ((char*)h2)[3];
          vnet_buffer (b2)->mpls.exp = (((char*)h2)[2] & 0xe) >> 1;
          vnet_buffer (b2)->mpls.first = 1;
          vnet_buffer (b3)->mpls.ttl = ((char*)h3)[3];
          vnet_buffer (b3)->mpls.exp = (((char*)h3)[2] & 0xe) >> 1;
          vnet_buffer (b3)->mpls.first = 1;

          /*
           * pop the label that was just used in the lookup
           */
          vlib_buffer_advance(b0, sizeof(*h0));
          vlib_buffer_advance(b1, sizeof(*h1));
          vlib_buffer_advance(b2, sizeof(*h2));
          vlib_buffer_advance(b3, sizeof(*h3));

          if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
          {
              mpls_lookup_trace_t *tr = vlib_add_trace (vm, node,
                                                        b0, sizeof (*tr));
              tr->next_index = next0;
              tr->lb_index = lbi0;
              tr->lfib_index = lfib_index0;
              tr->hash = hash_c0;
              tr->label_net_byte_order = h0->label_exp_s_ttl;
          }

          if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
          {
              mpls_lookup_trace_t *tr = vlib_add_trace (vm, node,
                                                        b1, sizeof (*tr));
              tr->next_index = next1;
              tr->lb_index = lbi1;
              tr->lfib_index = lfib_index1;
              tr->hash = hash_c1;
              tr->label_net_byte_order = h1->label_exp_s_ttl;
          }

          if (PREDICT_FALSE(b2->flags & VLIB_BUFFER_IS_TRACED))
          {
              mpls_lookup_trace_t *tr = vlib_add_trace (vm, node,
                                                        b2, sizeof (*tr));
              tr->next_index = next2;
              tr->lb_index = lbi2;
              tr->lfib_index = lfib_index2;
              tr->hash = hash_c2;
              tr->label_net_byte_order = h2->label_exp_s_ttl;
          }

          if (PREDICT_FALSE(b3->flags & VLIB_BUFFER_IS_TRACED))
          {
              mpls_lookup_trace_t *tr = vlib_add_trace (vm, node,
                                                        b3, sizeof (*tr));
              tr->next_index = next3;
              tr->lb_index = lbi3;
              tr->lfib_index = lfib_index3;
              tr->hash = hash_c3;
              tr->label_net_byte_order = h3->label_exp_s_ttl;
          }

          vlib_validate_buffer_enqueue_x4 (vm, node, next_index,
                                           to_next, n_left_to_next,
                                           bi0, bi1, bi2, bi3,
                                           next0, next1, next2, next3);
        }

      while (n_left_from > 0 && n_left_to_next > 0)
      {
          u32 lbi0, next0, lfib_index0, bi0, hash_c0;
          const mpls_unicast_header_t * h0;
          const load_balance_t *lb0;
          const dpo_id_t *dpo0;
          vlib_buffer_t * b0;

          bi0 = from[0];
          to_next[0] = bi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          b0 = vlib_get_buffer (vm, bi0);
          h0 = vlib_buffer_get_current (b0);

          lfib_index0 = vec_elt(mm->fib_index_by_sw_if_index,
                                vnet_buffer(b0)->sw_if_index[VLIB_RX]);

          lbi0 = mpls_fib_table_forwarding_lookup(lfib_index0, h0);
          hash_c0 = vnet_buffer(b0)->ip.flow_hash = 0;

          if (MPLS_IS_REPLICATE & lbi0)
          {
              next0 = mpls_lookup_to_replicate_edge;
              vnet_buffer (b0)->ip.adj_index[VLIB_TX] =
                  (lbi0 & ~MPLS_IS_REPLICATE);
          }
          else
          {
              lb0 = load_balance_get(lbi0);
              ASSERT (lb0->lb_n_buckets > 0);
              ASSERT (is_pow2 (lb0->lb_n_buckets));

              if (PREDICT_FALSE(lb0->lb_n_buckets > 1))
              {
                  hash_c0 = vnet_buffer (b0)->ip.flow_hash =
                      mpls_compute_flow_hash(h0, lb0->lb_hash_config);
                  dpo0 = load_balance_get_fwd_bucket
                      (lb0,
                       (hash_c0 & (lb0->lb_n_buckets_minus_1)));
              }
              else
              {
                  dpo0 = load_balance_get_bucket_i (lb0, 0);
              }
              next0 = dpo0->dpoi_next_node;
              vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

              vlib_increment_combined_counter
                  (cm, thread_index, lbi0, 1,
                   vlib_buffer_length_in_chain (vm, b0));
          }

          /*
           * before we pop the label, copy the values we need to maintain.
           * The label header is in network byte order.
           *  last byte is the TTL.
           *  bits 2 to 4 inclusive are the EXP bits
           */
          vnet_buffer (b0)->mpls.ttl = ((char*)h0)[3];
          vnet_buffer (b0)->mpls.exp = (((char*)h0)[2] & 0xe) >> 1;
          vnet_buffer (b0)->mpls.first = 1;

          /*
           * pop the label that was just used in the lookup
           */
          vlib_buffer_advance(b0, sizeof(*h0));

          if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
          {
              mpls_lookup_trace_t *tr = vlib_add_trace (vm, node,
                                                        b0, sizeof (*tr));
              tr->next_index = next0;
              tr->lb_index = lbi0;
              tr->lfib_index = lfib_index0;
              tr->hash = hash_c0;
              tr->label_net_byte_order = h0->label_exp_s_ttl;
          }

          vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                           to_next, n_left_to_next,
                                           bi0, next0);
        }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }
  vlib_node_increment_counter (vm, mpls_lookup_node.index,
                               MPLS_ERROR_PKTS_DECAP, from_frame->n_vectors);
  return from_frame->n_vectors;
}

/*
 * Table of MPLS error strings.  Every mpls_error (sym, str) entry listed in
 * error.def expands to its string argument, so the table holds the strings
 * in the order error.def lists them.
 */
static char * mpls_error_strings[] = {
#define mpls_error(sym, str) str,
#include "error.def"
#undef mpls_error
};

/*
 * Registration of the "mpls-lookup" graph node, dispatched through
 * mpls_lookup().
 */
VLIB_REGISTER_NODE (mpls_lookup_node, static) = {
  .name = "mpls-lookup",
  .function = mpls_lookup,

  /* Frame elements are u32s (the buffer indices read by mpls_lookup). */
  .vector_size = sizeof (u32),

  /* Declared as a sibling of the mpls-load-balance node. */
  .sibling_of = "mpls-load-balance",

  /* Error counters, named from the error.def string table. */
  .n_errors = MPLS_N_ERROR,
  .error_strings = mpls_error_strings,

  /* Trace and packet-header (un)formatting hooks. */
  .format_trace = format_mpls_lookup_trace,
  .format_buffer = format_mpls_header,
  .unformat_buffer = unformat_mpls_header,
};

/*
 * Pair the mpls_lookup_node registration with its mpls_lookup dispatch
 * function via the multiarch helper macro.
 * NOTE(review): presumably this emits CPU-architecture-specific variants of
 * mpls_lookup and selects one at runtime -- confirm against the macro
 * definition.
 */
VLIB_NODE_FUNCTION_MULTIARCH (mpls_lookup_node, mpls_lookup)

/*
 * Trace record rendered by format_mpls_load_balance_trace().
 */
typedef struct {
  /* Printed as "next [..]"; presumably the next-node index chosen for the
   * packet -- confirm against the code that fills the trace. */
  u32 next_index;
  /* Printed as "LB index"; presumably the load-balance object index used. */
  u32 lb_index;
  /* Printed as "hash"; presumably the flow hash used for bucket selection. */
  u32 hash;
} mpls_load_balance_trace_t;

/**
 * Format an mpls_load_balance_trace_t for trace output.
 *
 * @param s     format vector to append to.
 * @param args  va_list carrying, in order: a vlib_main_t * (unused),
 *              a vlib_node_t * (unused) and the mpls_load_balance_trace_t *
 *              to render.
 * @return the vector returned by format() after appending the trace text.
 */
static u8 *
format_mpls_load_balance_trace (u8 * s, va_list * args)
{
  /* The first two arguments are not used but must still be consumed so the
   * trace pointer is read from the correct va_list position. */
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  mpls_load_balance_trace_t * t = va_arg (*args, mpls_load_balance_trace_t *);

  /* All three fields are u32: print them unsigned so that a flow hash with
   * the top bit set is not rendered as a negative number. */
  s = format (s, "MPLS: next [%u], LB index %u hash %u",
              t->next_index, t->lb_index, t->hash);
  return s;
}

static uword
mpls_load_balance (vlib_main_t * vm,
                  vlib_node_runtime_t * node,
                  vlib_frame_t * frame)
{
  vlib_combined_counter_main_t * cm = &load_balance_main.lbm_via_counters;
  u32 n_left_from, n_left_to_next, * from, * to_next;
  u32 thread_index = vlib_get_thread_index();
  u32 next;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  next = node->cached_next_index;

  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next,
                           to_next, n_left_to_next);


      while (n_left_from >= 4 && n_left_to_next >= 2)
        {
          const load_balance_t *lb0, *lb1;
          vlib_buffer_t * p0, *p1;
          u32 pi0, lbi0, hc0, pi1, lbi1, hc1, next0, next1;
          const mpls_unicast_header_t *mpls0, *mpls1;
          const dpo_id_t *dpo0, *dpo1;

          /* Prefetch next iteration. */
          {
            vlib_buffer_t * p2, * p3;

            p2 = vlib_get_buffer (vm, from[2]);
            p3 = vlib_get_buffer (vm, from[3]);

            vlib_prefetch_buffer_header (p2, STORE);
            vlib_prefetch_buffer_header (p3, STORE);

            CLIB_PREFETCH (p2->data, sizeof (mpls0[0]), LOAD);
            CLIB_PREFETCH (p3->data, sizeof (mpls0[0]), LOAD);
          }

          pi0 = to_next[0] = from[0];
          pi1 = to_next[1] = from[1];

          from += 2;
          n_left_from -= 2;
          to_next += 2;
          n_left_to_next -= 2;

          p0 = vlib_get_buffer (vm, pi0);
          p1 = vlib_get_buffer (vm, pi1);

          mpls0 = vlib_buffer_get_current (p0);
          mpls1 = vlib_buffer_get_current (p1);
          lbi0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
          lbi1 = vnet_buffer (p1)->ip.adj_index[VLIB_TX];

          lb0 = load_balance_get(lbi0);
          lb1 = load_balance_get(lbi1);

          /*
           * this node is for via FIBs we can re-use the hash value from the
           * to node if present.
           * We don't want to use the same hash value at each level in the recursion
           * graph as that would lead to polarisation
           */
          hc0 = vnet_buffer (p0)->ip.flow_hash = 0;
          hc1 = vnet_buffer (p1)->ip.flow_hash = 0;

          if (PREDICT_FALSE (lb0->lb_n_buckets > 1))
          {
              if (PREDICT_TRUE (vnet_buffer(p0)->ip.flow_hash))
              {
                  hc0 = vnet_buffer(p0)->ip.flow_hash = vnet_buffer(p0)->ip.flow_hash >> 1;
              }
              else
              {
                  hc0 = vnet_buffer(p0)->ip.flow_hash = mpls_compute_flow_hash(mpls0, hc0);
              }
              dpo0 = load_balance_get_fwd_bucket(lb0, (hc0 & lb0->lb_n_buckets_minus_1));
          }
          else
          {
              dpo0 = load_balance_get_bucket_i (lb0, 0);
          }
          if (PREDICT_FALSE (lb1->lb_n_buckets > 1))
          {
              if (PREDICT_TRUE (vnet_buffer(p1)->ip.flow_hash))
              {
                  hc1 = vnet_buffer(p1)->ip.flow_hash = vnet_buffer(p1)->ip.flow_hash >> 1;
              }
              else
              {
                  hc1 = vnet_buffer(p1)->ip.flow_hash = mpls_compute_flow_hash(mpls1, hc1);
              }
              dpo1 = load_balance_get_fwd_bucket(lb1, (hc1 & lb1->lb_n_buckets_minus_1));
          }
          else
          {
              dpo1 = load_balance_get_bucket_i (lb1, 0);
          }

          next0 = dpo0->dpoi_next_node;