/*
* sr_steering.c: ipv6 segment routing steering into SR policy
*
* Copyright (c) 2016 Cisco and/or its affiliates.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* @file
* @brief Packet steering into SR Policies
*
* This file is in charge of handling the FIB appropriately to steer packets
* through SR Policies as defined in 'sr_policy_rewrite.c'. Notice that here
* we are only doing steering. SR policy application is done in
* sr_policy_rewrite.c
*
* Supports:
* - Steering of IPv6 traffic Destination Address based
* - Steering of IPv4 traffic Destination Address based
* - Steering of L2 frames, interface based (sw interface)
*/
#include <vlib/vlib.h>
#include <vnet/vnet.h>
#include <vnet/srv6/sr.h>
#include <vnet/ip/ip.h>
#include <vnet/srv6/sr_packet.h>
#include <vnet/ip/ip6_packet.h>
#include <vnet/fib/ip6_fib.h>
#include <vnet/dpo/dpo.h>
#include <vppinfra/error.h>
#include <vppinfra/elog.h>
/**
 * @brief Steer L2 and L3 traffic through a given SR policy
 *
 * Adds, updates or deletes a steering policy. The steering policy is
 * identified by a key built from the traffic type plus either
 * (prefix, mask_width, table_id) for L3 traffic or sw_if_index for L2
 * traffic. If an entry with the same key already exists and is_del is not
 * set, the entry is re-pointed to the requested SR policy.
 *
 * @param is_del non-zero to delete the steering policy, zero to add/update
 * @param bsid is the bindingSID of the SR Policy (alt to sr_policy_index)
 * @param sr_policy_index is the index of the SR Policy (alt to bsid); only
 *        used when bsid is NULL
 * @param table_id is the VRF where to install the FIB entry for the BSID
 *        (~0 selects table 0)
 * @param prefix is the IPv4/v6 address for L3 traffic type
 * @param mask_width is the mask for L3 traffic type
 * @param sw_if_index is the incoming interface for L2 traffic
 * @param traffic_type describes the type of traffic
 *        (SR_STEER_IPV4, SR_STEER_IPV6 or SR_STEER_L2)
 *
 * @return 0 if correct, else error:
 *         -1 unknown traffic type, or disabling the L2 feature failed
 *         -2 no SR policy found for the given bsid
 *         -3 invalid/non-hardware sw_if_index, or L2 redirection failed
 *         -4 delete requested but the steering policy does not exist
 *         -5 L2/IPv4 steering requested on an SR policy that is not encap
 */
int
sr_steering_policy (int is_del, ip6_address_t * bsid, u32 sr_policy_index,
                    u32 table_id, ip46_address_t * prefix, u32 mask_width,
                    u32 sw_if_index, u8 traffic_type)
{
  ip6_sr_main_t *sm = &sr_main;
  sr_steering_key_t key;
  ip6_sr_steering_policy_t *steer_pl;
  fib_prefix_t pfx = { 0 };
  ip6_sr_policy_t *sr_policy = 0;
  uword *p = 0;

  clib_memset (&key, 0, sizeof (sr_steering_key_t));

  /* Compute the steer policy key */
  if (traffic_type == SR_STEER_IPV4 || traffic_type == SR_STEER_IPV6)
    {
      key.l3.prefix.as_u64[0] = prefix->as_u64[0];
      key.l3.prefix.as_u64[1] = prefix->as_u64[1];
      key.l3.mask_width = mask_width;
      key.l3.fib_table = (table_id != (u32) ~ 0 ? table_id : 0);
    }
  else if (traffic_type == SR_STEER_L2)
    {
      key.l2.sw_if_index = sw_if_index;

      /* Sanitise the SW_IF_INDEX */
      if (pool_is_free_index (sm->vnet_main->interface_main.sw_interfaces,
                              sw_if_index))
        return -3;

      vnet_sw_interface_t *sw =
        vnet_get_sw_interface (sm->vnet_main, sw_if_index);
      if (sw->type != VNET_SW_INTERFACE_TYPE_HARDWARE)
        return -3;
    }
  else
    return -1;

  key.traffic_type = traffic_type;

  /* Search for the item */
  p = mhash_get (&sm->sr_steer_policies_hash, &key);

  if (p)
    {
      /* Retrieve Steer Policy function */
      steer_pl = pool_elt_at_index (sm->steer_policies, p[0]);

      if (is_del)
        {
          if (steer_pl->classify.traffic_type == SR_STEER_IPV6)
            {
              /* Remove FIB entry */
              pfx.fp_proto = FIB_PROTOCOL_IP6;
              pfx.fp_len = steer_pl->classify.l3.mask_width;
              pfx.fp_addr.ip6 = steer_pl->classify.l3.prefix.ip6;

              fib_table_entry_delete (fib_table_find
                                      (FIB_PROTOCOL_IP6,
                                       steer_pl->classify.l3.fib_table),
                                      &pfx, FIB_SOURCE_SR);
            }
          else if (steer_pl->classify.traffic_type == SR_STEER_IPV4)
            {
              /* Remove FIB entry */
              pfx.fp_proto = FIB_PROTOCOL_IP4;
              pfx.fp_len = steer_pl->classify.l3.mask_width;
              pfx.fp_addr.ip4 = steer_pl->classify.l3.prefix.ip4;

              fib_table_entry_delete (fib_table_find
                                      (FIB_PROTOCOL_IP4,
                                       steer_pl->classify.l3.fib_table), &pfx,
                                      FIB_SOURCE_SR);
            }
          else if (steer_pl->classify.traffic_type == SR_STEER_L2)
            {
              /* Remove HW redirection */
              int ret = vnet_feature_enable_disable ("device-input",
                                                     "sr-pl-rewrite-encaps-l2",
                                                     sw_if_index, 0, 0, 0);

              if (ret != 0)
                return -1;

              sm->sw_iface_sr_policies[sw_if_index] = ~(u32) 0;

              /* Remove promiscuous mode from interface */
              vnet_main_t *vnm = vnet_get_main ();
              ethernet_main_t *em = &ethernet_main;
              ethernet_interface_t *eif =
                ethernet_get_interface (em, sw_if_index);

              if (!eif)
                goto cleanup_error_redirection;

              ethernet_set_flags (vnm, sw_if_index, 0);
            }

          /* Delete SR steering policy entry */
          pool_put (sm->steer_policies, steer_pl);
          mhash_unset (&sm->sr_steer_policies_hash, &key, NULL);

          /* If no more SR policies or steering policies */
          if (!pool_elts (sm->sr_policies) && !pool_elts (sm->steer_policies))
            {
              fib_table_unlock (sm->fib_table_ip6,
                                FIB_PROTOCOL_IP6, FIB_SOURCE_SR);
              fib_table_unlock (sm->fib_table_ip4,
                                FIB_PROTOCOL_IP6, FIB_SOURCE_SR);
              sm->fib_table_ip6 = (u32) ~ 0;
              sm->fib_table_ip4 = (u32) ~ 0;
            }

          return 0;
        }
      else /* It means user requested to update an existing SR steering policy */
        {
          /* Retrieve SR steering policy */
          if (bsid)
            {
              p = mhash_get (&sm->sr_policies_index_hash, bsid);
              if (p)
                sr_policy = pool_elt_at_index (sm->sr_policies, p[0]);
              else
                return -2;
            }
          else
            sr_policy = pool_elt_at_index (sm->sr_policies, sr_policy_index);

          if (!sr_policy)
            return -2;

          steer_pl->sr_policy = sr_policy - sm->sr_policies;

          /* Remove old FIB/hw redirection and create a new one */
          if (steer_pl->classify.traffic_type == SR_STEER_IPV6)
            {
              /* Remove FIB entry */
              pfx.fp_proto = FIB_PROTOCOL_IP6;
              pfx.fp_len = steer_pl->classify.l3.mask_width;
              pfx.fp_addr.ip6 = steer_pl->classify.l3.prefix.ip6;

              fib_table_entry_delete (fib_table_find
                                      (FIB_PROTOCOL_IP6,
                                       steer_pl->classify.l3.fib_table),
                                      &pfx, FIB_SOURCE_SR);

              /* Create a new one */
              goto update_fib;
            }
          else if (steer_pl->classify.traffic_type == SR_STEER_IPV4)
            {
              /* Remove FIB entry */
              pfx.fp_proto = FIB_PROTOCOL_IP4;
              pfx.fp_len = steer_pl->classify.l3.mask_width;
              pfx.fp_addr.ip4 = steer_pl->classify.l3.prefix.ip4;

              fib_table_entry_delete (fib_table_find
                                      (FIB_PROTOCOL_IP4,
                                       steer_pl->classify.l3.fib_table),
                                      &pfx, FIB_SOURCE_SR);

              /* Create a new one */
              goto update_fib;
            }
          else if (steer_pl->classify.traffic_type == SR_STEER_L2)
            {
              /* Update L2-HW redirection */
              goto update_fib;
            }
        }
    }
  else if (is_del)
    /* delete; steering policy does not exist; complain */
    return -4;

  /* Retrieve SR policy */
  if (bsid)
    {
      p = mhash_get (&sm->sr_policies_index_hash, bsid);
      if (p)
        sr_policy = pool_elt_at_index (sm->sr_policies, p[0]);
      else
        return -2;
    }
  else
    sr_policy = pool_elt_at_index (sm->sr_policies, sr_policy_index);

  /* Create a new steering policy */
  pool_get (sm->steer_policies, steer_pl);
  clib_memset (steer_pl, 0, sizeof (*steer_pl));

  if (traffic_type == SR_STEER_IPV4 || traffic_type == SR_STEER_IPV6)
    {
      clib_memcpy_fast (&steer_pl->classify.l3.prefix, prefix,
                        sizeof (ip46_address_t));
      steer_pl->classify.l3.mask_width = mask_width;
      steer_pl->classify.l3.fib_table =
        (table_id != (u32) ~ 0 ? table_id : 0);
      steer_pl->classify.traffic_type = traffic_type;
    }
  else if (traffic_type == SR_STEER_L2)
    {
      steer_pl->classify.l2.sw_if_index = sw_if_index;
      steer_pl->classify.traffic_type = traffic_type;
    }
  else
    {
      /* Incorrect API usage. Should never get here */
      pool_put (sm->steer_policies, steer_pl);
      mhash_unset (&sm->sr_steer_policies_hash, &key, NULL);
      return -1;
    }

  steer_pl->sr_policy = sr_policy - sm->sr_policies;

  /* Create and store key */
  mhash_set (&sm->sr_steer_policies_hash, &key, steer_pl - sm->steer_policies,
             NULL);

  if (traffic_type == SR_STEER_L2)
    {
      if (!sr_policy->is_encap)
        goto cleanup_error_encap;

      if (vnet_feature_enable_disable
          ("device-input", "sr-pl-rewrite-encaps-l2", sw_if_index, 1, 0, 0))
        goto cleanup_error_redirection;

      /* Set promiscuous mode on interface */
      vnet_main_t *vnm = vnet_get_main ();
      ethernet_main_t *em = &ethernet_main;
      ethernet_interface_t *eif = ethernet_get_interface (em, sw_if_index);

      if (!eif)
        goto cleanup_error_redirection;

      ethernet_set_flags (vnm, sw_if_index,
                          ETHERNET_INTERFACE_FLAG_ACCEPT_ALL);
    }
  else if (traffic_type == SR_STEER_IPV4)
    if (!sr_policy->is_encap)
      goto cleanup_error_encap;

update_fib:
  /* FIB API calls - Recursive route through the BindingSID */
  if (traffic_type == SR_STEER_IPV6)
    {
      pfx.fp_proto = FIB_PROTOCOL_IP6;
      pfx.fp_len = steer_pl->classify.l3.mask_width;
      pfx.fp_addr.ip6 = steer_pl->classify.l3.prefix.ip6;

      fib_table_entry_path_add (fib_table_find (FIB_PROTOCOL_IP6,
                                                (table_id !=
                                                 (u32) ~ 0 ?
                                                 table_id : 0)),
                                &pfx, FIB_SOURCE_SR,
                                FIB_ENTRY_FLAG_LOOSE_URPF_EXEMPT,
                                DPO_PROTO_IP6,
                                (ip46_address_t *) & sr_policy->bsid, ~0,
                                sm->fib_table_ip6, 1, NULL,
                                FIB_ROUTE_PATH_FLAG_NONE);
    }
  else if (traffic_type == SR_STEER_IPV4)
    {
      pfx.fp_proto = FIB_PROTOCOL_IP4;
      pfx.fp_len = steer_pl->classify.l3.mask_width;
      pfx.fp_addr.ip4 = steer_pl->classify.l3.prefix.ip4;

      /* The next hop is the (IPv6) BSID, hence DPO_PROTO_IP6 here too */
      fib_table_entry_path_add (fib_table_find (FIB_PROTOCOL_IP4,
                                                (table_id !=
                                                 (u32) ~ 0 ?
                                                 table_id : 0)),
                                &pfx, FIB_SOURCE_SR,
                                FIB_ENTRY_FLAG_LOOSE_URPF_EXEMPT,
                                DPO_PROTO_IP6,
                                (ip46_address_t *) & sr_policy->bsid, ~0,
                                sm->fib_table_ip4, 1, NULL,
                                FIB_ROUTE_PATH_FLAG_NONE);
    }
  else if (traffic_type == SR_STEER_L2)
    {
      if (sw_if_index < vec_len (sm->sw_iface_sr_policies))
        sm->sw_iface_sr_policies[sw_if_index] = steer_pl->sr_policy;
      else
        {
          /* Grow the per-interface vector to cover every sw interface */
          vec_resize (sm->sw_iface_sr_policies,
                      (pool_len (sm->vnet_main->interface_main.sw_interfaces)
                       - vec_len (sm->sw_iface_sr_policies)));
          sm->sw_iface_sr_policies[sw_if_index] = steer_pl->sr_policy;
        }
    }

  return 0;

cleanup_error_encap:
  pool_put (sm->steer_policies, steer_pl);
  mhash_unset (&sm->sr_steer_policies_hash, &key, NULL);
  return -5;

cleanup_error_redirection:
  pool_put (sm->steer_policies, steer_pl);
  mhash_unset (&sm->sr_steer_policies_hash, &key, NULL);
  return -3;
}
static clib_error_t *
sr_steer_policy_command_fn (vlib_main_t * vm, unformat_input_t * input,
vlib_cli_command_t * cmd)
{
vnet_main_t *vnm = vnet_get_main ();
int is_del = 0;
ip46_address_t prefix;
u32 dst_mask_width = 0;
u32 sw_if_index = (u32) ~ 0;
u8 traffic_type = 0;
u32 fib_table = (u32) ~ 0;
ip6_address_t bsid;
u32 sr_policy_index = (u32) ~ 0;
u8 sr_policy_set = 0;
clib_memset (&prefix, 0, sizeof (ip46_address_t));
int rv;
while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
{
if (unformat (input, "del"))
is_del = 1;
else if (!traffic_type
&& unformat (input, "l3 %U/%d", unformat_ip6_address,
&prefix.ip6, &dst_mask_width))
traffic_type = SR_STEER_IPV6;
else if (!traffic_type
&& unformat (input, "l3 %U/%d", unformat_ip4_address,
&prefix.ip4, &dst_mask_width))
traffic_type = SR_STEER_IPV4;
else if (!traffic_type
&& unformat (input, "l2 %U", unformat_vnet_sw_interface, vnm,
&sw_if_index))
traffic_type = SR_STEER_L2;
else if (!sr_policy_set
&& unformat (input, "via index %d", &sr_policy_index))
sr_policy_set = 1;
else if (!sr_policy_set
&& unformat (input, "via bsid %U",
unformat_ip6_address, &bsid))
sr_policy_set = 1;
else if (fib_table == (u32) ~ 0
&& unformat (input, "fib-table %d", &fib_table));
else
break;
}
if (!traffic_type)
return clib_error_return (0, "No L2/L3 traffic specified");
if (!sr_policy_set)
return clib_error_return (0, "No SR policy specified");
/* Make sure that the prefixes are clean */
if (traffic_type == SR_STEER_IPV4)
{
u32 mask =
(dst_mask_width ? (0xFFFFFFFFu >> (32 - dst_mask_width)) : 0);
prefix.ip4.as_u32 &= mask;
}
else if (traffic_type == SR_STEER_IPV6)
{
ip6_address_t mask;
ip6_address_mask_from_width (&mask, dst_mask_width);
ip6_address_mask (&prefix.ip6, &mask);
}
rv =
sr_steering_policy (is_del, (sr_policy_index == ~(u32) 0 ? &bsid : NULL),
sr_policy_index, fib_table, &prefix, dst_mask_width,
sw_if_index, traffic_type);
switch (rv)
{
case 0:
break;
case 1:
return 0;
case -1:
return clib_error_return (0, "Incorrect API usage.");
case -
@media only all and (prefers-color-scheme: dark) {
.highlight .hll { background-color: #49483e }
.highlight .c { color: #75715e } /* Comment */
.highlight .err { color: #960050; background-color: #1e0010 } /* Error */
.highlight .k { color: #66d9ef } /* Keyword */
.highlight .l { color: #ae81ff } /* Literal */
.highlight .n { color: #f8f8f2 } /* Name */
.highlight .o { color: #f92672 } /* Operator */
.highlight .p { color: #f8f8f2 } /* Punctuation */
.highlight .ch { color: #75715e } /* Comment.Hashbang */
.highlight .cm { color: #75715e } /* Comment.Multiline */
.highlight .cp { color: #75715e } /* Comment.Preproc */
.highlight .cpf { color: #75715e } /* Comment.PreprocFile */
.highlight .c1 { color: #75715e } /* Comment.Single */
.highlight .cs { color: #75715e } /* Comment.Special */
.highlight .gd { color: #f92672 } /* Generic.Deleted */
.highlight .ge { font-style: italic } /* Generic.Emph */
.highlight .gi { color: #a6e22e } /* Generic.Inserted */
.highlight .gs { font-weight: bold } /* Generic.Strong */
.highlight .gu { color: #75715e } /* Generic.Subheading */
.highlight .kc { color: #66d9ef } /* Keyword.Constant */
.highlight .kd { color: #66d9ef } /* Keyword.Declaration */
.highlight .kn { color: #f92672 } /* Keyword.Namespace */
.highlight .kp { color: #66d9ef } /* Keyword.Pseudo */
.highlight .kr { color: #66d9ef } /* Keyword.Reserved */
.highlight .kt { color: #66d9ef } /* Keyword.Type */
.highlight .ld { color: #e6db74 } /* Literal.Date */
.highlight .m { color: #ae81ff } /* Literal.Number */
.highlight .s { color: #e6db74 } /* Literal.String */
.highlight .na { color: #a6e22e } /* Name.Attribute */
.highlight .nb { color: #f8f8f2 } /* Name.Builtin */
.highlight .nc { color: #a6e22e } /* Name.Class */
.highlight .no { color: #66d9ef } /* Name.Constant */
.highlight .nd { color: #a6e22e } /* Name.Decorator */
.highlight .ni { color: #f8f8f2 } /* Name.Entity */
.highlight .ne { color: #a6e22e } /* Name.Exception */
.highlight .nf { color: #a6e22e } /* Name.Function */
.highlight .nl { color: #f8f8f2 } /* Name.Label */
.highlight .nn { color: #f8f8f2 } /* Name.Namespace */
.highlight .nx { color: #a6e22e } /* Name.Other */
.highlight .py { color: #f8f8f2 } /* Name.Property */
.highlight .nt { color: #f92672 } /* Name.Tag */
.highlight .nv { color: #f8f8f2 } /* Name.Variable */
.highlight .ow { color: #f92672 } /* Operator.Word */
.highlight .w { color: #f8f8f2 } /* Text.Whitespace */
.highlight .mb { color: #ae81ff } /* Literal.Number.Bin */
.highlight .mf { color: #ae81ff } /* Literal.Number.Float */
.highlight .mh { color: #ae81ff } /* Literal.Number.Hex */
.highlight .mi { color: #ae81ff } /* Literal.Number.Integer */
.highlight .mo { color: #ae81ff } /* Literal.Number.Oct */
.highlight .sa { color: #e6db74 } /* Literal.String.Affix */
.highlight .sb { color: #e6db74 } /* Literal.String.Backtick */
.highlight .sc { color: #e6db74 } /* Literal.String.Char */
.highlight .dl { color: #e6db74 } /* Literal.String.Delimiter */
.highlight .sd { color: #e6db74 } /* Literal.String.Doc */
.highlight .s2 { color: #e6db74 } /* Literal.String.Double */
.highlight .se { color: #ae81ff } /* Literal.String.Escape */
.highlight .sh { color: #e6db74 } /* Literal.String.Heredoc */
.highlight .si { color: #e6db74 } /* Literal.String.Interpol */
.highlight .sx { color: #e6db74 } /* Literal.String.Other */
.highlight .sr { color: #e6db74 } /* Literal.String.Regex */
.highlight .s1 { color: #e6db74 } /* Literal.String.Single */
.highlight .ss { color: #e6db74 } /* Literal.String.Symbol */
.highlight .bp { color: #f8f8f2 } /* Name.Builtin.Pseudo */
.highlight .fm { color: #a6e22e } /* Name.Function.Magic */
.highlight .vc { color: #f8f8f2 } /* Name.Variable.Class */
.highlight .vg { color: #f8f8f2 } /* Name.Variable.Global */
.highlight .vi { color: #f8f8f2 } /* Name.Variable.Instance */
.highlight .vm { color: #f8f8f2 } /* Name.Variable.Magic */
.highlight .il { color: #ae81ff } /* Literal.Number.Integer.Long */
}
@media (prefers-color-scheme: light) {
.highlight .hll { background-color: #ffffcc }
.highlight .c { color: #888888 } /* Comment */
.highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */
.highlight .k { color: #008800; font-weight: bold } /* Keyword */
.highlight .ch { color: #888888 } /* Comment.Hashbang */
.highlight .cm { color: #888888 } /* Comment.Multiline */
.highlight .cp { color: #cc0000; font-weight: bold } /* Comment.Preproc */
.highlight .cpf { color: #888888 } /* Comment.PreprocFile */
.highlight .c1 { color: #888888 } /* Comment.Single */
.highlight .cs { color: #cc0000; font-weight: bold; background-color: #fff0f0 } /* Comment.Special */
.highlight .gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */
.highlight .ge { font-style: italic } /* Generic.Emph */
.highlight .gr { color: #aa0000 } /* Generic.Error */
.highlight .gh { color: #333333 } /* Generic.Heading */
.highlight .gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */
.highlight .go { color: #888888 } /* Generic.Output */
.highlight .gp { color: #555555 } /* Generic.Prompt */
.highlight .gs { font-weight: bold } /* Generic.Strong */
.highlight .gu { color: #666666 } /* Generic.Subheading */
.highlight .gt { color: #aa0000 } /* Generic.Traceback */
.highlight .kc { color: #008800; font-weight: bold } /* Keyword.Constant */
.highlight .kd { color: #008800; font-weight: bold } /* Keyword.Declaration */
.highlight .kn { color: #008800; font-weight: bold } /* Keyword.Namespace */
.highlight .kp { color: #008800 } /* Keyword.Pseudo */
.highlight .kr { color: #008800; font-weight: bold } /* Keyword.Reserved */
.highlight .kt { color: #888888; font-weight: bold } /* Keyword.Type */
.highlight .m { color: #0000DD; font-weight: bold } /* Literal.Number */
.highlight .s { color: #dd2200; background-color: #fff0f0 } /* Literal.String */
.highlight .na { color: #336699 } /* Name.Attribute */
.highlight .nb { color: #003388 } /* Name.Builtin */
.highlight .nc { color: #bb0066; font-weight: bold } /* Name.Class */
.highlight .no { color: #003366; font-weight: bold } /* Name.Constant */
.highlight .nd { color: #555555 } /* Name.Decorator */
.highlight .ne { color: #bb0066; font-weight: bold } /* Name.Exception */
.highlight .nf { color: #0066bb; font-weight: bold } /* Name.Function */
.highlight .nl { color: #336699; font-style: italic } /* Name.Label */
.highlight .nn { color: #bb0066; font-weight: bold } /* Name.Namespace */
.highlight .py { color: #336699; font-weight: bold } /* Name.Property */
.highlight .nt { color: #bb0066; font-weight: bold } /* Name.Tag */
.highlight .nv { color: #336699 } /* Name.Variable */
.highlight .ow { color: #008800 } /* Operator.Word */
.highlight .w { color: #bbbbbb } /* Text.Whitespace */
.highlight .mb { color: #0000DD; font-weight: bold } /* Literal.Number.Bin */
.highlight .mf { color: #0000DD; font-weight: bold } /* Literal.Number.Float */
.highlight .mh { color: #0000DD; font-weight: bold } /* Literal.Number.Hex */
.highlight .mi { color: #0000DD; font-weight: bold } /* Literal.Number.Integer */
.highlight .mo { color: #0000DD; font-weight: bold } /* Literal.Number.Oct */
.highlight .sa { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Affix */
.highlight .sb { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Backtick */
.highlight .sc { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Char */
.highlight .dl { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Delimiter */
.highlight .sd { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Doc */
.highlight .s2 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Double */
.highlight .se { color: #0044dd; background-color: #fff0f0 } /* Literal.String.Escape */
.highlight .sh { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Heredoc */
.highlight .si { color: #3333bb; background-color: #fff0f0 } /* Literal.String.Interpol */
.highlight .sx { color: #22bb22; background-color: #f0fff0 } /* Literal.String.Other */
.highlight .sr { color: #008800; background-color: #fff0ff } /* Literal.String.Regex */
.highlight .s1 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Single */
.highlight .ss { color: #aa6600; background-color: #fff0f0 } /* Literal.String.Symbol */
.highlight .bp { color: #003388 } /* Name.Builtin.Pseudo */
.highlight .fm { color: #0066bb; font-weight: bold } /* Name.Function.Magic */
.highlight .vc { color: #336699 } /* Name.Variable.Class */
.highlight .vg { color: #dd7700 } /* Name.Variable.Global */
.highlight .vi { color: #3333bb } /* Name.Variable.Instance */
.highlight .vm { color: #336699 } /* Name.Variable.Magic */
.highlight .il { color: #0000DD; font-weight: bold } /* Literal.Number.Integer.Long */
}
/*
* Copyright (c) 2016-2019 Cisco and/or its affiliates.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <unistd.h>
#include <stdio.h>
#include <signal.h>
#include <dlfcn.h>
#include <pthread.h>
#include <time.h>
#include <stdarg.h>
#include <sys/resource.h>
#include <netinet/tcp.h>
#include <vcl/ldp_socket_wrapper.h>
#include <vcl/ldp.h>
#include <sys/time.h>
#include <vcl/vcl_locked.h>
#include <vppinfra/time.h>
#include <vppinfra/bitmap.h>
#include <vppinfra/lock.h>
#include <vppinfra/pool.h>
#include <vppinfra/hash.h>
/* Constructor support is unconditionally enabled here, so
 * CONSTRUCTOR_ATTRIBUTE always expands to __attribute__ ((constructor)). */
#define HAVE_CONSTRUCTOR_ATTRIBUTE
#ifdef HAVE_CONSTRUCTOR_ATTRIBUTE
#define CONSTRUCTOR_ATTRIBUTE \
__attribute__ ((constructor))
#else
#define CONSTRUCTOR_ATTRIBUTE
#endif /* HAVE_CONSTRUCTOR_ATTRIBUTE */
/* Likewise, DESTRUCTOR_ATTRIBUTE always expands to
 * __attribute__ ((destructor)). */
#define HAVE_DESTRUCTOR_ATTRIBUTE
#ifdef HAVE_DESTRUCTOR_ATTRIBUTE
#define DESTRUCTOR_ATTRIBUTE \
__attribute__ ((destructor))
#else
#define DESTRUCTOR_ATTRIBUTE
#endif
/* Number of worker contexts pre-allocated by ldp_alloc_workers () */
#define LDP_MAX_NWORKERS 32
/*
 * Per-worker LDP state; ldp_worker_get_current () indexes the pool of
 * these with vppcom_worker_index ().
 */
typedef struct ldp_worker_ctx_
{
/* NOTE(review): not referenced in the visible part of this file; presumably
 * a scratch I/O buffer - confirm against its users */
u8 *io_buffer;
/* Time base used by ldp_pselect () for timeout handling; lazily set up
 * with clib_time_init () */
clib_time_t clib_time;
/*
* Select state
*
* For each of read/write/except: the plain bitmap receives a copy of the
* caller's fd_set, the si_ bitmap holds vcl session indices and the libc_
* bitmap holds fds that are left to libc (see ldp_select_init_maps ()).
*/
clib_bitmap_t *rd_bitmap;
clib_bitmap_t *wr_bitmap;
clib_bitmap_t *ex_bitmap;
clib_bitmap_t *si_rd_bitmap;
clib_bitmap_t *si_wr_bitmap;
clib_bitmap_t *si_ex_bitmap;
clib_bitmap_t *libc_rd_bitmap;
clib_bitmap_t *libc_wr_bitmap;
clib_bitmap_t *libc_ex_bitmap;
/*
* Poll state
*/
vcl_poll_t *vcl_poll;
struct pollfd *libc_poll;
u16 *libc_poll_idxs;
/*
* Epoll state
*/
u8 epoll_wait_vcl;
int vcl_mq_epfd;
} ldp_worker_ctx_t;
/* clib_bitmap_t, fd_mask and vcl_si_set are used interchangeably (the select
 * path copies fd_set contents straight into clib bitmaps). Make sure they
 * are the same size; a mismatch fails the build. */
STATIC_ASSERT (sizeof (clib_bitmap_t) == sizeof (fd_mask),
"ldp bitmap size mismatch");
STATIC_ASSERT (sizeof (vcl_si_set) == sizeof (fd_mask),
"ldp bitmap size mismatch");
/* Process-wide LDP state (single instance: ldp_main) */
typedef struct
{
/* Pool of per-worker contexts, allocated by ldp_alloc_workers () */
ldp_worker_ctx_t *workers;
/* Non-zero once ldp_init () has started/completed initialization */
int init;
/* Application name, built by ldp_set_app_name () as "ldp-<pid>-<name>" */
char app_name[LDP_APP_NAME_MAX];
/* Offset between a vls handle and the fd handed to the application:
 * fd = vlsh + vlsh_bit_val (see ldp_vlsh_to_fd / ldp_fd_to_vlsh) */
u32 vlsh_bit_val;
/* Always kept as vlsh_bit_val - 1 */
u32 vlsh_bit_mask;
/* Debug verbosity; LDBG (lvl, ...) logs when debug > lvl */
u32 debug;
/** vcl needs next epoll_create to go to libc_epoll */
u8 vcl_needs_real_epoll;
} ldp_main_t;
/* Current debug level */
#define LDP_DEBUG ldp->debug
/*
 * Log a pid-prefixed warning when the debug level is strictly greater than
 * _lvl. NOTE: this expands to a bare `if` statement (no do/while wrapper),
 * so an `else` written directly after an unbraced LDBG () would bind to the
 * macro's `if`; keep invocations inside braces.
 */
#define LDBG(_lvl, _fmt, _args...) \
if (ldp->debug > _lvl) \
clib_warning ("ldp<%d>: " _fmt, getpid(), ##_args)
/* The single LDP state instance. The fd<->vlsh offset defaults to
 * 1 << LDP_SID_BIT_MIN and may be overridden from the environment in
 * ldp_init (). */
static ldp_main_t ldp_main = {
.vlsh_bit_val = (1 << LDP_SID_BIT_MIN),
.vlsh_bit_mask = (1 << LDP_SID_BIT_MIN) - 1,
.debug = LDP_DEBUG_INIT,
};
/* Convenience pointer used throughout the file (and by LDBG) */
static ldp_main_t *ldp = &ldp_main;
/* Return the LDP context that belongs to the calling vppcom worker. */
static inline ldp_worker_ctx_t *
ldp_worker_get_current (void)
{
  return &ldp->workers[vppcom_worker_index ()];
}
/*
 * Store "ldp-<pid>-<app_name>" in ldp->app_name, truncating it to
 * LDP_APP_NAME_MAX - 1 characters if necessary. The caller's string is
 * only read, never modified.
 */
static inline void
ldp_set_app_name (char *app_name)
{
  int rv = snprintf (ldp->app_name, LDP_APP_NAME_MAX,
                     "ldp-%d-%s", getpid (), app_name);

  /* On truncation make sure the destination buffer (not the input, which
   * may be a string literal or shorter than LDP_APP_NAME_MAX) is
   * terminated. */
  if (rv >= LDP_APP_NAME_MAX)
    ldp->app_name[LDP_APP_NAME_MAX - 1] = 0;
}
/* Return the LDP application name, installing the default name ("app")
 * first if none has been set yet. */
static inline char *
ldp_get_app_name ()
{
  char *name = ldp->app_name;

  if (!name[0])
    ldp_set_app_name ("app");

  return name;
}
/* Map a vls handle to the file descriptor number exposed to the
 * application by offsetting it with ldp->vlsh_bit_val. */
static inline int
ldp_vlsh_to_fd (vls_handle_t vlsh)
{
  int fd = vlsh + ldp->vlsh_bit_val;

  return fd;
}
static inline vls_handle_t
ldp_fd_to_vlsh (int fd)
{
if (fd < ldp->vlsh_bit_val)
return VLS_INVALID_HANDLE;
return (fd - ldp->vlsh_bit_val);
}
/* Pre-allocate room for LDP_MAX_NWORKERS worker contexts; does nothing if
 * the pool already exists. */
static void
ldp_alloc_workers (void)
{
  if (!ldp->workers)
    {
      pool_alloc (ldp->workers, LDP_MAX_NWORKERS);
    }
}
/*
 * One-time LDP initialization, called at the top of every intercepted libc
 * entry point.
 *
 * Creates the VLS application, allocates the worker pool and then applies
 * the optional environment overrides (LDP_ENV_DEBUG, LDP_ENV_APP_NAME,
 * LDP_ENV_SID_BIT).
 *
 * Returns 0 when already initialized or on success (VPPCOM_EEXIST from
 * vls_app_create () is also reported as 0, but skips the rest of the
 * setup), the vls_app_create () error code on failure, or -1 when the
 * configured sid bit does not fit in an fd_set. On failure ldp->init is
 * cleared again so that a later call retries.
 */
static inline int
ldp_init (void)
{
ldp_worker_ctx_t *ldpw;
int rv;
if (PREDICT_TRUE (ldp->init))
return 0;
/* Mark as initialized before calling into vls, and flag that vcl needs
 * the next epoll_create to go to libc while the app is being created */
ldp->init = 1;
ldp->vcl_needs_real_epoll = 1;
rv = vls_app_create (ldp_get_app_name ());
if (rv != VPPCOM_OK)
{
ldp->vcl_needs_real_epoll = 0;
/* The app already exists: treat as success, nothing more to set up */
if (rv == VPPCOM_EEXIST)
return 0;
LDBG (2, "\nERROR: ldp_init: vppcom_app_create()"
" failed! rv = %d (%s)\n", rv, vppcom_retval_str (rv));
ldp->init = 0;
return rv;
}
ldp->vcl_needs_real_epoll = 0;
ldp_alloc_workers ();
ldpw = ldp_worker_get_current ();
/* Optional override of the debug level */
char *env_var_str = getenv (LDP_ENV_DEBUG);
if (env_var_str)
{
u32 tmp;
if (sscanf (env_var_str, "%u", &tmp) != 1)
clib_warning ("LDP<%d>: WARNING: Invalid LDP debug level specified in"
" the env var " LDP_ENV_DEBUG " (%s)!", getpid (),
env_var_str);
else
{
ldp->debug = tmp;
LDBG (0, "configured LDP debug level (%u) from env var "
LDP_ENV_DEBUG "!", ldp->debug);
}
}
/* Optional override of the application name. NOTE: this happens after
 * vls_app_create () was already called with the previous name. */
env_var_str = getenv (LDP_ENV_APP_NAME);
if (env_var_str)
{
ldp_set_app_name (env_var_str);
LDBG (0, "configured LDP app name (%s) from the env var "
LDP_ENV_APP_NAME "!", ldp->app_name);
}
/* Optional override of the bit that separates libc fds from vls handles;
 * the value is clamped to [LDP_SID_BIT_MIN, LDP_SID_BIT_MAX] */
env_var_str = getenv (LDP_ENV_SID_BIT);
if (env_var_str)
{
u32 sb;
if (sscanf (env_var_str, "%u", &sb) != 1)
{
LDBG (0, "WARNING: Invalid LDP sid bit specified in the env var "
LDP_ENV_SID_BIT " (%s)! sid bit value %d (0x%x)", env_var_str,
ldp->vlsh_bit_val, ldp->vlsh_bit_val);
}
else if (sb < LDP_SID_BIT_MIN)
{
ldp->vlsh_bit_val = (1 << LDP_SID_BIT_MIN);
ldp->vlsh_bit_mask = ldp->vlsh_bit_val - 1;
LDBG (0, "WARNING: LDP sid bit (%u) specified in the env var "
LDP_ENV_SID_BIT " (%s) is too small. Using LDP_SID_BIT_MIN"
" (%d)! sid bit value %d (0x%x)", sb, env_var_str,
LDP_SID_BIT_MIN, ldp->vlsh_bit_val, ldp->vlsh_bit_val);
}
else if (sb > LDP_SID_BIT_MAX)
{
ldp->vlsh_bit_val = (1 << LDP_SID_BIT_MAX);
ldp->vlsh_bit_mask = ldp->vlsh_bit_val - 1;
LDBG (0, "WARNING: LDP sid bit (%u) specified in the env var "
LDP_ENV_SID_BIT " (%s) is too big. Using LDP_SID_BIT_MAX"
" (%d)! sid bit value %d (0x%x)", sb, env_var_str,
LDP_SID_BIT_MAX, ldp->vlsh_bit_val, ldp->vlsh_bit_val);
}
else
{
ldp->vlsh_bit_val = (1 << sb);
ldp->vlsh_bit_mask = ldp->vlsh_bit_val - 1;
LDBG (0, "configured LDP sid bit (%u) from "
LDP_ENV_SID_BIT "! sid bit value %d (0x%x)", sb,
ldp->vlsh_bit_val, ldp->vlsh_bit_val);
}
/* Make sure there are enough bits in the fd set for vcl sessions */
if (ldp->vlsh_bit_val > FD_SETSIZE / 2)
{
LDBG (0, "ERROR: LDP vlsh bit value %d > FD_SETSIZE/2 %d!",
ldp->vlsh_bit_val, FD_SETSIZE / 2);
ldp->init = 0;
return -1;
}
}
/* Reset the time base of every worker context currently in the pool;
 * ldp_pselect () re-initializes it lazily when init_cpu_time is 0 */
/* *INDENT-OFF* */
pool_foreach (ldpw, ldp->workers, ({
clib_memset (&ldpw->clib_time, 0, sizeof (ldpw->clib_time));
}));
/* *INDENT-ON* */
LDBG (0, "LDP initialization: done!");
return 0;
}
/*
 * close(2) interposer.
 *
 * Descriptors that do not map to a vls handle are passed to libc. For vls
 * handles, the libc epoll fd attached to the session (if any) is closed
 * first, then the session itself is closed. Returns 0 on success or -1
 * with errno set.
 */
int
close (int fd)
{
  vls_handle_t vlsh;
  int rv, epfd;

  if ((errno = -ldp_init ()))
    return -1;

  vlsh = ldp_fd_to_vlsh (fd);
  if (vlsh == VLS_INVALID_HANDLE)
    {
      LDBG (0, "fd %d: calling libc_close", fd);
      return libc_close (fd);
    }

  epfd = vls_attr (vlsh, VPPCOM_ATTR_GET_LIBC_EPFD, 0, 0);
  if (PREDICT_FALSE (epfd < 0))
    {
      /* Attribute lookup failed: report its error code */
      errno = -epfd;
      return -1;
    }

  if (epfd > 0)
    {
      LDBG (0, "fd %d: calling libc_close: epfd %u", fd, epfd);

      rv = libc_close (epfd);
      if (rv < 0)
        {
          /* Clear the libc epfd stored on the session */
          u32 size = sizeof (epfd);

          epfd = 0;
          (void) vls_attr (vlsh, VPPCOM_ATTR_SET_LIBC_EPFD, &epfd, &size);
        }
    }

  LDBG (0, "fd %d: calling vls_close: vlsh %u", fd, vlsh);

  rv = vls_close (vlsh);
  if (rv != VPPCOM_OK)
    {
      errno = -rv;
      rv = -1;
    }

  return rv;
}
/*
 * read(2) interposer: vls-backed descriptors are read through vls_read (),
 * everything else goes to libc. Negative vls results are converted to
 * errno with a -1 return.
 */
ssize_t
read (int fd, void *buf, size_t nbytes)
{
  vls_handle_t vlsh;
  ssize_t n_read;

  if ((errno = -ldp_init ()))
    return -1;

  vlsh = ldp_fd_to_vlsh (fd);
  if (vlsh == VLS_INVALID_HANDLE)
    return libc_read (fd, buf, nbytes);

  n_read = vls_read (vlsh, buf, nbytes);
  if (n_read < 0)
    {
      errno = -n_read;
      n_read = -1;
    }

  return n_read;
}
/*
 * readv(2) interposer.
 *
 * For vls-backed descriptors each iovec is filled in turn with vls_read ();
 * filling stops at the first short read or error, and the whole pass is
 * retried while nothing has been read and no error occurred. Other
 * descriptors go to libc.
 *
 * Returns the number of bytes read, or -1 with errno set. For vls-backed
 * descriptors, iovcnt == 0 returns 0 and iovcnt < 0 fails with EINVAL.
 */
ssize_t
readv (int fd, const struct iovec * iov, int iovcnt)
{
  int rv = 0, i, total = 0;
  vls_handle_t vlsh;
  ssize_t size = 0;

  if ((errno = -ldp_init ()))
    return -1;

  vlsh = ldp_fd_to_vlsh (fd);
  if (vlsh != VLS_INVALID_HANDLE)
    {
      /* With no iovecs the retry loop below would never make progress
       * (rv and total both stay 0) and spin forever. */
      if (iovcnt < 0)
        {
          errno = EINVAL;
          return -1;
        }
      if (iovcnt == 0)
        return 0;

      do
        {
          for (i = 0; i < iovcnt; ++i)
            {
              rv = vls_read (vlsh, iov[i].iov_base, iov[i].iov_len);
              if (rv < 0)
                break;
              else
                {
                  total += rv;
                  /* Short read: nothing more available right now */
                  if (rv < iov[i].iov_len)
                    break;
                }
            }
        }
      while ((rv >= 0) && (total == 0));

      if (rv < 0)
        {
          errno = -rv;
          size = -1;
        }
      else
        size = total;
    }
  else
    {
      size = libc_readv (fd, iov, iovcnt);
    }

  return size;
}
/*
 * write(2) interposer: vls-backed descriptors are written through
 * vls_write_msg (), everything else goes to libc. Negative vls results are
 * converted to errno with a -1 return.
 */
ssize_t
write (int fd, const void *buf, size_t nbytes)
{
  vls_handle_t vlsh;
  ssize_t n_written = 0;

  if ((errno = -ldp_init ()))
    return -1;

  vlsh = ldp_fd_to_vlsh (fd);
  if (vlsh == VLS_INVALID_HANDLE)
    return libc_write (fd, buf, nbytes);

  n_written = vls_write_msg (vlsh, (void *) buf, nbytes);
  if (n_written < 0)
    {
      errno = -n_written;
      n_written = -1;
    }

  return n_written;
}
/*
 * writev(2) interposer.
 *
 * For vls-backed descriptors each iovec is sent in turn with
 * vls_write_msg (); sending stops at the first short write or error, and
 * the whole pass is retried while nothing has been written and no error
 * occurred. Other descriptors go to libc.
 *
 * Returns the number of bytes written, or -1 with errno set. For vls-backed
 * descriptors, iovcnt == 0 returns 0 and iovcnt < 0 fails with EINVAL.
 */
ssize_t
writev (int fd, const struct iovec * iov, int iovcnt)
{
  ssize_t size = 0, total = 0;
  vls_handle_t vlsh;
  int i, rv = 0;

  if ((errno = -ldp_init ()))
    return -1;

  vlsh = ldp_fd_to_vlsh (fd);
  if (vlsh != VLS_INVALID_HANDLE)
    {
      /* With no iovecs the retry loop below would never make progress
       * (rv and total both stay 0) and spin forever. */
      if (iovcnt < 0)
        {
          errno = EINVAL;
          return -1;
        }
      if (iovcnt == 0)
        return 0;

      do
        {
          for (i = 0; i < iovcnt; ++i)
            {
              rv = vls_write_msg (vlsh, iov[i].iov_base, iov[i].iov_len);
              if (rv < 0)
                break;
              else
                {
                  total += rv;
                  /* Short write: stop here and report what was sent */
                  if (rv < iov[i].iov_len)
                    break;
                }
            }
        }
      while ((rv >= 0) && (total == 0));

      if (rv < 0)
        {
          errno = -rv;
          size = -1;
        }
      else
        size = total;
    }
  else
    {
      size = libc_writev (fd, iov, iovcnt);
    }

  return size;
}
/*
 * fcntl(2) interposer.
 *
 * For vls-backed descriptors only F_SETFL and F_GETFL are forwarded (as
 * VPPCOM_ATTR_SET_FLAGS / VPPCOM_ATTR_GET_FLAGS); F_SETFD is accepted and
 * ignored, anything else fails with EOPNOTSUPP. Other descriptors are
 * handed to libc together with the variadic arguments.
 */
int
fcntl (int fd, int cmd, ...)
{
  vls_handle_t vlsh;
  int rv = 0;
  va_list ap;

  if ((errno = -ldp_init ()))
    return -1;

  va_start (ap, cmd);

  vlsh = ldp_fd_to_vlsh (fd);
  LDBG (0, "fd %u vlsh %d, cmd %u", fd, vlsh, cmd);

  if (vlsh == VLS_INVALID_HANDLE)
    {
      rv = libc_vfcntl (fd, cmd, ap);
    }
  else
    {
      int flags = va_arg (ap, int);
      u32 size = sizeof (flags);

      if (cmd == F_SETFL)
        {
          rv = vls_attr (vlsh, VPPCOM_ATTR_SET_FLAGS, &flags, &size);
        }
      else if (cmd == F_GETFL)
        {
          rv = vls_attr (vlsh, VPPCOM_ATTR_GET_FLAGS, &flags, &size);
          /* On success the caller expects the flags as return value */
          if (rv == VPPCOM_OK)
            rv = flags;
        }
      else if (cmd == F_SETFD)
        {
          /* TODO handle this */
          LDBG (0, "F_SETFD ignored flags %u", flags);
          rv = 0;
        }
      else
        {
          rv = -EOPNOTSUPP;
        }

      if (rv < 0)
        {
          errno = -rv;
          rv = -1;
        }
    }

  va_end (ap);

  return rv;
}
/*
 * ioctl(2) interposer.
 *
 * For vls-backed descriptors FIONREAD returns the result of the
 * VPPCOM_ATTR_GET_NREAD attribute query and FIONBIO sets or clears
 * O_NONBLOCK through VPPCOM_ATTR_SET_FLAGS; any other request fails with
 * EOPNOTSUPP. Other descriptors are handed to libc.
 */
int
ioctl (int fd, unsigned long int cmd, ...)
{
  vls_handle_t vlsh;
  va_list ap;
  int rv;

  if ((errno = -ldp_init ()))
    return -1;

  va_start (ap, cmd);

  vlsh = ldp_fd_to_vlsh (fd);
  if (vlsh == VLS_INVALID_HANDLE)
    {
      rv = libc_vioctl (fd, cmd, ap);
    }
  else
    {
      if (cmd == FIONREAD)
        {
          rv = vls_attr (vlsh, VPPCOM_ATTR_GET_NREAD, 0, 0);
        }
      else if (cmd == FIONBIO)
        {
          u32 flags = va_arg (ap, int) ? O_NONBLOCK : 0;
          u32 size = sizeof (flags);

          /* TBD: When VPPCOM_ATTR_[GS]ET_FLAGS supports flags other than
           *      non-blocking, the flags should be read here and merged
           *      with O_NONBLOCK.
           */
          rv = vls_attr (vlsh, VPPCOM_ATTR_SET_FLAGS, &flags, &size);
        }
      else
        {
          rv = -EOPNOTSUPP;
        }

      if (rv < 0)
        {
          errno = -rv;
          rv = -1;
        }
    }

  va_end (ap);

  return rv;
}
/*
 * Split one select() fd_set into a vcl session bitmap and a libc fd bitmap.
 *
 * The first n_bytes of 'original' are copied into *resultb and then zeroed
 * in 'original'. Every fd set in the copy (up to and including nfds) is
 * recorded either in *libcb, when it has no vls handle, or in *vclb under
 * its session index. *si_bits and *libc_bits are raised to one past the
 * highest bit set in *vclb and *libcb respectively; they are never lowered.
 */
always_inline void
ldp_select_init_maps (fd_set * __restrict original,
                      clib_bitmap_t ** resultb, clib_bitmap_t ** libcb,
                      clib_bitmap_t ** vclb, int nfds, u32 minbits,
                      u32 n_bytes, uword * si_bits, uword * libc_bits)
{
  uword vcl_top, libc_top;
  vls_handle_t handle;
  int fd;

  /* Make sure all three bitmaps can hold at least minbits bits */
  clib_bitmap_validate (*vclb, minbits);
  clib_bitmap_validate (*libcb, minbits);
  clib_bitmap_validate (*resultb, minbits);

  clib_memcpy_fast (*resultb, original, n_bytes);
  memset (original, 0, n_bytes);

  /* *INDENT-OFF* */
  clib_bitmap_foreach (fd, *resultb, ({
    if (fd > nfds)
      break;
    handle = ldp_fd_to_vlsh (fd);
    if (handle != VLS_INVALID_HANDLE)
      clib_bitmap_set_no_check (*vclb, vlsh_to_session_index (handle), 1);
    else
      clib_bitmap_set_no_check (*libcb, fd, 1);
  }));
  /* *INDENT-ON* */

  vcl_top = clib_bitmap_last_set (*vclb) + 1;
  if (vcl_top > *si_bits)
    *si_bits = vcl_top;

  libc_top = clib_bitmap_last_set (*libcb) + 1;
  if (libc_top > *libc_bits)
    *libc_bits = libc_top;
}
/*
 * Translate a bitmap of vcl session indices back into fds and set them in
 * the caller's fd_set. A NULL fd_set is a no-op.
 *
 * Returns 0 on success, or -1 with errno = EBADFD if a session maps to a
 * negative fd.
 */
always_inline int
ldp_select_vcl_map_to_libc (clib_bitmap_t * vclb, fd_set * __restrict libcb)
{
  uword session_index;
  int fd;

  if (libcb == 0)
    return 0;

  /* *INDENT-OFF* */
  clib_bitmap_foreach (session_index, vclb, ({
    fd = ldp_vlsh_to_fd (vls_session_index_to_vlsh (session_index));
    if (PREDICT_FALSE (fd < 0))
      {
        errno = EBADFD;
        return -1;
      }
    FD_SET (fd, libcb);
  }));
  /* *INDENT-ON* */

  return 0;
}
/*
 * Set, in the caller's fd_set, every bit that is set in 'result'.
 * Does nothing when 'libcb' is NULL.
 */
always_inline void
ldp_select_libc_map_merge (clib_bitmap_t * result, fd_set * __restrict libcb)
{
  uword bit;

  if (libcb != NULL)
    {
      /* *INDENT-OFF* */
      clib_bitmap_foreach (bit, result, ({
        FD_SET ((int) bit, libcb);
      }));
      /* *INDENT-ON* */
    }
}
/*
 * Common implementation behind select() and pselect().
 *
 * - nfds < 0 fails with EINVAL.
 * - nfds == 0 with a timeout busy-waits until the timeout has elapsed and
 *   returns 0; nfds == 0 without a timeout fails with EINVAL.
 * - nfds <= ldp->vlsh_bit_val is forwarded to libc_pselect() unchanged.
 * - Otherwise each fd_set is split into a libc bitmap and a VCL bitmap
 *   (ldp_select_init_maps) and both sides are polled in a loop until at
 *   least one of them reports ready descriptors or the deadline passes.
 *
 * Returns the number of ready descriptors, 0 on timeout, or -1 with errno
 * set.  The caller's timeout structure is never updated.
 */
int
ldp_pselect (int nfds, fd_set * __restrict readfds,
             fd_set * __restrict writefds,
             fd_set * __restrict exceptfds,
             const struct timespec *__restrict timeout,
             const __sigset_t * __restrict sigmask)
{
  u32 minbits = clib_max (nfds, BITS (uword)), n_bytes;
  ldp_worker_ctx_t *ldpw = ldp_worker_get_current ();
  struct timespec libc_tspec = { 0 };
  f64 time_out, vcl_timeout = 0;
  uword si_bits, libc_bits;
  int rv, bits_set = 0;

  if (nfds < 0)
    {
      errno = EINVAL;
      return -1;
    }

  if (PREDICT_FALSE (ldpw->clib_time.init_cpu_time == 0))
    clib_time_init (&ldpw->clib_time);

  if (timeout)
    {
      time_out = (timeout->tv_sec == 0 && timeout->tv_nsec == 0) ?
        (f64) 0 : (f64) timeout->tv_sec + (f64) timeout->tv_nsec / (f64) 1e9;

      /* Turn the relative timeout into a deadline on the worker clock.
       * The polling loop below compares clib_time_now() against time_out,
       * so it has to be a deadline for every nfds, not only for the
       * nfds == 0 sleep case.
       */
      time_out += clib_time_now (&ldpw->clib_time);

      /* select as fine grained sleep */
      if (!nfds)
        {
          while (clib_time_now (&ldpw->clib_time) < time_out)
            ;
          return 0;
        }
    }
  else if (!nfds)
    {
      errno = EINVAL;
      return -1;
    }
  else
    time_out = -1;		/* sentinel: wait forever */

  if (nfds <= ldp->vlsh_bit_val)
    {
      rv = libc_pselect (nfds, readfds, writefds, exceptfds,
                         timeout, sigmask);
      goto done;
    }

  si_bits = libc_bits = 0;
  n_bytes = nfds / 8 + ((nfds % 8) ? 1 : 0);

  if (readfds)
    ldp_select_init_maps (readfds, &ldpw->rd_bitmap, &ldpw->libc_rd_bitmap,
                          &ldpw->si_rd_bitmap, nfds, minbits, n_bytes,
                          &si_bits, &libc_bits);
  if (writefds)
    ldp_select_init_maps (writefds, &ldpw->wr_bitmap,
                          &ldpw->libc_wr_bitmap, &ldpw->si_wr_bitmap, nfds,
                          minbits, n_bytes, &si_bits, &libc_bits);
  if (exceptfds)
    ldp_select_init_maps (exceptfds, &ldpw->ex_bitmap,
                          &ldpw->libc_ex_bitmap, &ldpw->si_ex_bitmap, nfds,
                          minbits, n_bytes, &si_bits, &libc_bits);

  if (PREDICT_FALSE (!si_bits && !libc_bits))
    {
      errno = EINVAL;
      rv = -1;
      goto done;
    }

  /* With no VCL descriptors libc_pselect() may use the caller's timeout;
   * otherwise it is called with a zero timeout on every loop iteration.
   */
  if (!si_bits)
    libc_tspec = timeout ? *timeout : libc_tspec;

  do
    {
      if (si_bits)
        {
          /* Reload the scratch bitmaps with the VCL session bits. */
          if (readfds)
            clib_memcpy_fast (ldpw->rd_bitmap, ldpw->si_rd_bitmap,
                              vec_len (ldpw->rd_bitmap) *
                              sizeof (clib_bitmap_t));
          if (writefds)
            clib_memcpy_fast (ldpw->wr_bitmap, ldpw->si_wr_bitmap,
                              vec_len (ldpw->wr_bitmap) *
                              sizeof (clib_bitmap_t));
          if (exceptfds)
            clib_memcpy_fast (ldpw->ex_bitmap, ldpw->si_ex_bitmap,
                              vec_len (ldpw->ex_bitmap) *
                              sizeof (clib_bitmap_t));

          rv = vls_select (si_bits, readfds ? ldpw->rd_bitmap : NULL,
                           writefds ? ldpw->wr_bitmap : NULL,
                           exceptfds ? ldpw->ex_bitmap : NULL, vcl_timeout);
          if (rv < 0)
            {
              errno = -rv;
              rv = -1;
            }
          else if (rv > 0)
            {
              if (ldp_select_vcl_map_to_libc (ldpw->rd_bitmap, readfds))
                {
                  rv = -1;
                  goto done;
                }

              if (ldp_select_vcl_map_to_libc (ldpw->wr_bitmap, writefds))
                {
                  rv = -1;
                  goto done;
                }

              if (ldp_select_vcl_map_to_libc (ldpw->ex_bitmap, exceptfds))
                {
                  rv = -1;
                  goto done;
                }
              bits_set = rv;
            }
        }

      if (libc_bits)
        {
          /* Reload the scratch bitmaps with the libc fd bits. */
          if (readfds)
            clib_memcpy_fast (ldpw->rd_bitmap, ldpw->libc_rd_bitmap,
                              vec_len (ldpw->libc_rd_bitmap) *
                              sizeof (clib_bitmap_t));
          if (writefds)
            clib_memcpy_fast (ldpw->wr_bitmap, ldpw->libc_wr_bitmap,
                              vec_len (ldpw->libc_wr_bitmap) *
                              sizeof (clib_bitmap_t));
          if (exceptfds)
            clib_memcpy_fast (ldpw->ex_bitmap, ldpw->libc_ex_bitmap,
                              vec_len (ldpw->libc_ex_bitmap) *
                              sizeof (clib_bitmap_t));

          rv = libc_pselect (libc_bits,
                             readfds ? (fd_set *) ldpw->rd_bitmap : NULL,
                             writefds ? (fd_set *) ldpw->wr_bitmap : NULL,
                             exceptfds ? (fd_set *) ldpw->ex_bitmap : NULL,
                             &libc_tspec, sigmask);
          if (rv > 0)
            {
              ldp_select_libc_map_merge (ldpw->rd_bitmap, readfds);
              ldp_select_libc_map_merge (ldpw->wr_bitmap, writefds);
              ldp_select_libc_map_merge (ldpw->ex_bitmap, exceptfds);
              bits_set += rv;
            }
        }

      if (bits_set)
        {
          rv = bits_set;
          goto done;
        }
    }
  while ((time_out == -1) || (clib_time_now (&ldpw->clib_time) < time_out));
  rv = 0;

done:
  /* TBD: set timeout to amount of time left */
  clib_bitmap_zero (ldpw->rd_bitmap);
  clib_bitmap_zero (ldpw->si_rd_bitmap);
  clib_bitmap_zero (ldpw->libc_rd_bitmap);
  clib_bitmap_zero (ldpw->wr_bitmap);
  clib_bitmap_zero (ldpw->si_wr_bitmap);
  clib_bitmap_zero (ldpw->libc_wr_bitmap);
  clib_bitmap_zero (ldpw->ex_bitmap);
  clib_bitmap_zero (ldpw->si_ex_bitmap);
  clib_bitmap_zero (ldpw->libc_ex_bitmap);

  return rv;
}
/*
 * select() interposer: converts the optional timeval into a timespec and
 * delegates to ldp_pselect() with no signal mask.
 */
int
select (int nfds, fd_set * __restrict readfds,
        fd_set * __restrict writefds,
        fd_set * __restrict exceptfds, struct timeval *__restrict timeout)
{
  struct timespec ts_storage;
  const struct timespec *ts = NULL;

  if (timeout != NULL)
    {
      ts_storage.tv_sec = timeout->tv_sec;
      ts_storage.tv_nsec = timeout->tv_usec * 1000;
      ts = &ts_storage;
    }

  return ldp_pselect (nfds, readfds, writefds, exceptfds, ts, NULL);
}
#ifdef __USE_XOPEN2K
/*
 * pselect() interposer: thin wrapper around ldp_pselect().
 *
 * NOTE(review): the caller's sigmask is not forwarded; ldp_pselect() is
 * always called with a null signal mask -- confirm this is intended.
 */
int
pselect (int nfds, fd_set * __restrict readfds,
         fd_set * __restrict writefds,
         fd_set * __restrict exceptfds,
         const struct timespec *__restrict timeout,
         const __sigset_t * __restrict sigmask)
{
  int n_ready;

  n_ready = ldp_pselect (nfds, readfds, writefds, exceptfds, timeout, 0);
  return n_ready;
}
#endif
/*
 * socket() interposer.
 *
 * AF_INET / AF_INET6 sockets of type SOCK_STREAM or SOCK_DGRAM (after
 * masking SOCK_CLOEXEC and SOCK_NONBLOCK) are created with vls_create();
 * everything else is passed to libc_socket().
 *
 * Returns the new fd, or -1 with errno set.
 */
int
socket (int domain, int type, int protocol)
{
  const int base_type = type & ~(SOCK_CLOEXEC | SOCK_NONBLOCK);
  const u8 nonblock = (type & SOCK_NONBLOCK) ? 1 : 0;
  vls_handle_t vlsh;
  u8 proto;

  if ((errno = -ldp_init ()))
    return -1;

  if ((domain != AF_INET && domain != AF_INET6) ||
      (base_type != SOCK_STREAM && base_type != SOCK_DGRAM))
    {
      LDBG (0, "calling libc_socket");
      return libc_socket (domain, type, protocol);
    }

  if (base_type == SOCK_DGRAM)
    proto = VPPCOM_PROTO_UDP;
  else
    proto = VPPCOM_PROTO_TCP;

  LDBG (0, "calling vls_create: proto %u (%s), is_nonblocking %u",
        proto, vppcom_proto_str (proto), nonblock);

  vlsh = vls_create (proto, nonblock);
  if (vlsh < 0)
    {
      /* negative handle doubles as a negative errno value */
      errno = -vlsh;
      return -1;
    }

  return ldp_vlsh_to_fd (vlsh);
}
/*
 * socketpair() interposer.
 *
 * Creates two connected sockets of type TYPE in domain DOMAIN using
 * protocol PROTOCOL and stores their descriptors in FDS[0] and FDS[1].
 * A PROTOCOL of zero lets one be chosen automatically.
 *
 * AF_INET / AF_INET6 stream or datagram pairs are not implemented here and
 * fail with ENOSYS; every other request goes to libc_socketpair().
 *
 * Returns 0 on success, -1 for errors.
 */
int
socketpair (int domain, int type, int protocol, int fds[2])
{
  const int base_type = type & ~(SOCK_CLOEXEC | SOCK_NONBLOCK);
  int is_inet, is_stream_or_dgram;

  if ((errno = -ldp_init ()))
    return -1;

  is_inet = (domain == AF_INET) || (domain == AF_INET6);
  is_stream_or_dgram = (base_type == SOCK_STREAM)
    || (base_type == SOCK_DGRAM);

  if (is_inet && is_stream_or_dgram)
    {
      LDBG (0, "LDP-TBD");
      errno = ENOSYS;
      return -1;
    }

  LDBG (1, "calling libc_socketpair");
  return libc_socketpair (domain, type, protocol, fds);
}
int
bind (int fd, __CONST_SOCKADDR_ARG addr, socklen_t len)
{
vls_handle_t vlsh;
int rv;
if ((errno = -ldp_init ()))
return -1;
vlsh = ldp_fd_to_vlsh (fd);
if (vlsh != VLS_INVALID_HANDLE)
{
vppcom_endpt_t ep;
switch (addr->sa_family)
{
case AF_INET:
if (len != sizeof (struct sockaddr_in))
{
LDBG (0, "ERROR: fd %d: vlsh %u: Invalid AF_INET addr len %u!",
fd, vlsh, len);
errno = EINVAL;
rv = -1;
goto done;
}
ep.is_ip4 = VPPCOM_IS_IP4;
ep.ip = (u8 *) & ((const struct sockaddr_in *) addr)->sin_addr;
ep.port = (u16) ((const struct sockaddr_in *) addr)->sin_port;
break;
case AF_INET6:
if (len != sizeof (struct sockaddr_in6))
{
LDBG (0, "ERROR: fd %d: vlsh %u: Invalid AF_INET6 addr len %u!",
fd, vlsh, len);
errno = EINVAL;
rv = -1;
goto done;
}
ep.is_ip4 = VPPCOM_IS_IP6;
ep.ip = (u8 *) & ((const struct sockaddr_in6 *) addr)->sin6_addr;
ep.port = (u16) ((const struct sockaddr_in6 *) addr)->sin6_port;
break;
default:
LDBG (0, "ERROR: fd %d: vlsh %u: Unsupported address family %u!",
fd, vlsh, addr->sa_family);
errno = EAFNOSUPPORT;
rv = -1;
goto done;
}
LDBG (0, "fd %d: calling vls_bind: vlsh %u, addr %p, len %u", fd, vlsh,
addr, len);
rv = vls_bind (vlsh, &ep);
if (rv != VPPCOM_OK)
{
errno = -rv;
rv = -1;
}
}
else
{
LDBG (0, "fd %d: calling libc_bind: addr %p, len %u", fd, addr, len);
rv = libc_bind (fd, addr, len);
}
done:
LDBG (1, "fd %d: returning %d", fd, rv);
return rv;
}
/*
 * Fill a caller supplied sockaddr from a vppcom endpoint.
 *
 * Nothing is written unless addr, len and ep are all non-NULL.  The family
 * is chosen from ep->is_ip4, the port is copied as is, *len is clamped to
 * the size of the matching sockaddr_in / sockaddr_in6, and the address
 * bytes are copied only when the clamped *len exceeds
 * "sizeof (sockaddr) - sizeof (address)".
 *
 * Returns 0, or -1 when ldp_init() fails (errno is set in that case).
 */
static inline int
ldp_copy_ep_to_sockaddr (__SOCKADDR_ARG addr, socklen_t * __restrict len,
                         vppcom_endpt_t * ep)
{
  int hdr_len, addr_bytes;

  if ((errno = -ldp_init ()))
    return -1;

  if (!addr || !len || !ep)
    return 0;

  if (ep->is_ip4 == VPPCOM_IS_IP4)
    {
      struct sockaddr_in *sin = (struct sockaddr_in *) addr;

      addr->sa_family = AF_INET;
      sin->sin_port = ep->port;
      if (*len > sizeof (struct sockaddr_in))
        *len = sizeof (struct sockaddr_in);
      hdr_len = sizeof (struct sockaddr_in) - sizeof (struct in_addr);
      addr_bytes = *len - hdr_len;
      if (addr_bytes > 0)
        memcpy (&sin->sin_addr, ep->ip, addr_bytes);
    }
  else
    {
      struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr;

      addr->sa_family = AF_INET6;
      sin6->sin6_port = ep->port;
      if (*len > sizeof (struct sockaddr_in6))
        *len = sizeof (struct sockaddr_in6);
      hdr_len = sizeof (struct sockaddr_in6) - sizeof (struct in6_addr);
      addr_bytes = *len - hdr_len;
      if (addr_bytes > 0)
        memcpy (sin6->sin6_addr.__in6_u.__u6_addr8, ep->ip, addr_bytes);
    }

  return 0;
}
/*
 * getsockname() interposer.
 *
 * For fds that map to a VLS handle the local endpoint is read with
 * VPPCOM_ATTR_GET_LCL_ADDR and converted by ldp_copy_ep_to_sockaddr();
 * other fds go to libc_getsockname().
 *
 * Returns 0 on success, -1 with errno set on failure.
 */
int
getsockname (int fd, __SOCKADDR_ARG addr, socklen_t * __restrict len)
{
  u8 ip_storage[sizeof (struct in6_addr)];
  vppcom_endpt_t ep;
  vls_handle_t vlsh;
  u32 ep_size = sizeof (ep);
  int rv;

  if ((errno = -ldp_init ()))
    return -1;

  vlsh = ldp_fd_to_vlsh (fd);
  if (vlsh == VLS_INVALID_HANDLE)
    return libc_getsockname (fd, addr, len);

  ep.ip = ip_storage;
  rv = vls_attr (vlsh, VPPCOM_ATTR_GET_LCL_ADDR, &ep, &ep_size);
  if (rv == VPPCOM_OK)
    rv = ldp_copy_ep_to_sockaddr (addr, len, &ep);

  /* either step failing leaves a negative code in rv */
  if (rv != VPPCOM_OK)
    {
      errno = -rv;
      rv = -1;
    }

  return rv;
}
int
connect (int fd, __CONST_SOCKADDR_ARG addr, socklen_t len)
{
vls_handle_t vlsh;
int rv;
if ((errno = -ldp_init ()))
return -1;
if (!addr)
{
LDBG (0, "ERROR: fd %d: NULL addr, len %u", fd, len);
errno = EINVAL;
rv = -1;
goto done;
}
vlsh = ldp_fd_to_vlsh (fd);
if (vlsh != VLS_INVALID_HANDLE)
{
vppcom_endpt_t ep;
switch (addr->sa_family)
{
case AF_INET:
if (len != sizeof (struct sockaddr_in))
{
LDBG (0, "fd %d: ERROR vlsh %u: Invalid AF_INET addr len %u!",
fd, vlsh, len);
errno = EINVAL;
rv = -1;
goto done;
}
ep.is_ip4 = VPPCOM_IS_IP4;
ep.ip = (u8 *) & ((const struct sockaddr_in *) addr)->sin_addr;
ep.port = (u16) ((const struct sockaddr_in *) addr)->sin_port;
break;
case AF_INET6:
if (len != sizeof (struct sockaddr_in6))
{
LDBG (0, "fd %d: ERROR vlsh %u: Invalid AF_INET6 addr len %u!",
fd, vlsh, len);
errno = EINVAL;
rv = -1;
goto done;
}
ep.is_ip4 = VPPCOM_IS_IP6;
ep.ip = (u8 *) & ((const struct sockaddr_in6 *) addr)->sin6_addr;
ep.port = (u16) ((const struct sockaddr_in6 *) addr)->sin6_port;
break;
default:
LDBG (0, "fd %d: ERROR vlsh %u: Unsupported address family %u!",
fd, vlsh, addr->sa_family);
errno = EAFNOSUPPORT;
rv = -1;
goto done;
}
LDBG (0, "fd %d: calling vls_connect(): vlsh %u addr %p len %u", fd,
vlsh, addr, len);
rv = vls_connect (vlsh, &ep);
if (rv != VPPCOM_OK)
{
errno = -rv;
rv = -1;
}
}
else
{
LDBG (0, "fd %d: calling libc_connect(): addr %p, len %u",
fd, addr, len);
rv = libc_connect (fd, addr, len);
}
done:
LDBG (1, "fd %d: returning %d (0x%x)", fd, rv, rv);
return rv;
}
/*
 * getpeername() interposer.
 *
 * For fds that map to a VLS handle the peer endpoint is read with
 * VPPCOM_ATTR_GET_PEER_ADDR and converted by ldp_copy_ep_to_sockaddr();
 * other fds go to libc_getpeername().
 *
 * Returns 0 on success, -1 with errno set on failure.
 */
int
getpeername (int fd, __SOCKADDR_ARG addr, socklen_t * __restrict len)
{
  u8 ip_storage[sizeof (struct in6_addr)];
  vppcom_endpt_t ep;
  vls_handle_t vlsh;
  u32 ep_size = sizeof (ep);
  int rv;

  if ((errno = -ldp_init ()))
    return -1;

  vlsh = ldp_fd_to_vlsh (fd);
  if (vlsh == VLS_INVALID_HANDLE)
    return libc_getpeername (fd, addr, len);

  ep.ip = ip_storage;
  rv = vls_attr (vlsh, VPPCOM_ATTR_GET_PEER_ADDR, &ep, &ep_size);
  if (rv == VPPCOM_OK)
    rv = ldp_copy_ep_to_sockaddr (addr, len, &ep);

  /* either step failing leaves a negative code in rv */
  if (rv != VPPCOM_OK)
    {
      errno = -rv;
      rv = -1;
    }

  return rv;
}
/*
 * send() interposer.
 *
 * fds that map to a VLS handle are written with vls_sendto() (no
 * destination endpoint); other fds go to libc_send().
 *
 * Returns the number of bytes accepted, or -1 with errno set.
 */
ssize_t
send (int fd, const void *buf, size_t n, int flags)
{
  vls_handle_t vlsh;
  ssize_t size;

  if ((errno = -ldp_init ()))
    return -1;

  /* Look the fd up only after ldp_init() has succeeded, as every other
   * interposer in this file does.
   */
  vlsh = ldp_fd_to_vlsh (fd);
  if (vlsh != VLS_INVALID_HANDLE)
    {
      size = vls_sendto (vlsh, (void *) buf, n, flags, NULL);
      if (size < VPPCOM_OK)
        {
          errno = -size;
          size = -1;
        }
    }
  else
    {
      size = libc_send (fd, buf, n, flags);
    }

  return size;
}
/*
 * sendfile() interposer.
 *
 * When out_fd maps to a VLS handle the transfer is emulated: data is read
 * from in_fd with libc_read() into the worker's io_buffer and written with
 * vls_write(), at most VPPCOM_ATTR_GET_NWRITE bytes per step, until len
 * bytes were moved, in_fd reaches end of file, or an error occurs.
 * Otherwise the call is passed to libc_sendfile().
 *
 * If offset is non-NULL, in_fd is positioned at *offset before reading and
 * again afterwards, and *offset is advanced by the number of bytes sent.
 *
 * Returns the number of bytes sent, or -1 with errno set.  On a
 * non-blocking session with nothing sent, errno is EAGAIN.
 *
 * NOTE(review): a short vls_write() is still accounted as nbytes sent --
 * confirm vls_write() cannot return a partial count here.
 */
ssize_t
sendfile (int out_fd, int in_fd, off_t * offset, size_t len)
{
  ldp_worker_ctx_t *ldpw = ldp_worker_get_current ();
  vls_handle_t vlsh;
  ssize_t size = 0;

  if ((errno = -ldp_init ()))
    return -1;

  vlsh = ldp_fd_to_vlsh (out_fd);
  if (vlsh != VLS_INVALID_HANDLE)
    {
      int rv;
      ssize_t results = 0;
      size_t n_bytes_left = len;
      size_t bytes_to_read;
      int nbytes;
      u8 eagain = 0;
      u32 flags, flags_len = sizeof (flags);

      rv = vls_attr (vlsh, VPPCOM_ATTR_GET_FLAGS, &flags, &flags_len);
      if (PREDICT_FALSE (rv != VPPCOM_OK))
        {
          LDBG (0, "ERROR: out fd %d: vls_attr: vlsh %u, returned %d (%s)!",
                out_fd, vlsh, rv, vppcom_retval_str (rv));

          vec_reset_length (ldpw->io_buffer);
          errno = -rv;
          size = -1;
          goto done;
        }

      if (offset)
        {
          off_t off = lseek (in_fd, *offset, SEEK_SET);
          if (PREDICT_FALSE (off == -1))
            {
              size = -1;
              goto done;
            }

          ASSERT (off == *offset);
        }

      do
        {
          /* How much can the session take right now? */
          size = vls_attr (vlsh, VPPCOM_ATTR_GET_NWRITE, 0, 0);
          if (size < 0)
            {
              LDBG (0, "ERROR: fd %d: vls_attr: vlsh %u returned %d (%s)!",
                    out_fd, vlsh, size, vppcom_retval_str (size));
              vec_reset_length (ldpw->io_buffer);
              errno = -size;
              size = -1;
              goto done;
            }

          bytes_to_read = size;
          if (bytes_to_read == 0)
            {
              if (flags & O_NONBLOCK)
                {
                  if (!results)
                    eagain = 1;
                  goto update_offset;
                }
              else
                continue;
            }
          bytes_to_read = clib_min (n_bytes_left, bytes_to_read);
          vec_validate (ldpw->io_buffer, bytes_to_read);
          nbytes = libc_read (in_fd, ldpw->io_buffer, bytes_to_read);
          if (nbytes < 0)
            {
              if (results == 0)
                {
                  vec_reset_length (ldpw->io_buffer);
                  size = -1;
                  goto done;
                }
              goto update_offset;
            }

          /* End of input: nothing more will ever arrive, so stop instead
           * of looping forever with n_bytes_left > 0.
           */
          if (nbytes == 0)
            goto update_offset;

          size = vls_write (vlsh, ldpw->io_buffer, nbytes);
          if (size < 0)
            {
              if (size == VPPCOM_EAGAIN)
                {
                  if (flags & O_NONBLOCK)
                    {
                      if (!results)
                        eagain = 1;
                      goto update_offset;
                    }
                  else
                    continue;
                }
              if (results == 0)
                {
                  vec_reset_length (ldpw->io_buffer);
                  errno = -size;
                  size = -1;
                  goto done;
                }
              goto update_offset;
            }

          results += nbytes;
          ASSERT (n_bytes_left >= nbytes);
          n_bytes_left = n_bytes_left - nbytes;
        }
      while (n_bytes_left > 0);

    update_offset:
      vec_reset_length (ldpw->io_buffer);
      if (offset)
        {
          off_t off = lseek (in_fd, *offset, SEEK_SET);
          if (PREDICT_FALSE (off == -1))
            {
              size = -1;
              goto done;
            }

          ASSERT (off == *offset);
          /* *offset becomes the byte following the last byte sent. */
          *offset += results;
        }

      if (eagain)
        {
          errno = EAGAIN;
          size = -1;
        }
      else
        size = results;
    }
  else
    {
      size = libc_sendfile (out_fd, in_fd, offset, len);
    }

done:
  return size;
}
/*
 * sendfile64() interposer: identical to sendfile(), to which it forwards
 * all arguments unchanged.
 */
ssize_t
sendfile64 (int out_fd, int in_fd, off_t * offset, size_t len)
{
  ssize_t n_sent;

  n_sent = sendfile (out_fd, in_fd, offset, len);
  return n_sent;
}
/*
 * recv() interposer.
 *
 * fds that map to a VLS handle are read with vls_recvfrom() (no source
 * endpoint requested); other fds go to libc_recv().
 *
 * Returns the number of bytes received, or -1 with errno set.
 */
ssize_t
recv (int fd, void *buf, size_t n, int flags)
{
  vls_handle_t vlsh;
  ssize_t size;

  if ((errno = -ldp_init ()))
    return -1;

  vlsh = ldp_fd_to_vlsh (fd);
  if (vlsh != VLS_INVALID_HANDLE)
    {
      size = vls_recvfrom (vlsh, buf, n, flags, NULL);
      if (size < 0)
        {
          /* Translate the negative VCL code into the libc convention:
           * errno carries the reason, the return value is exactly -1.
           */
          errno = -size;
          size = -1;
        }
    }
  else
    {
      size = libc_recv (fd, buf, n, flags);
    }

  return size;
}
ssize_t
sendto (int fd, const void *buf, size_t n, int flags,
__CONST_SOCKADDR_ARG addr, socklen_t addr_len)
{
vls_handle_t vlsh;
ssize_t size;
if ((errno = -ldp_init ()))
return -1;
vlsh = ldp_fd_to_vlsh (fd);
if (vlsh != INVALID_SESSION_ID)
{
vppcom_endpt_t *ep = 0;
vppcom_endpt_t _ep;
if (addr)
{
ep = &_ep;
switch (addr->sa_family)
{
case AF_INET:
ep->is_ip4 = VPPCOM_IS_IP4;
ep->ip =
(uint8_t *) & ((const struct sockaddr_in *) addr)->sin_addr;
ep->port =
(uint16_t) ((const struct sockaddr_in *) addr)->sin_port;
break;
case AF_INET6:
ep->is_ip4 = VPPCOM_IS_IP6;
ep->ip =
(uint8_t *) & ((const struct sockaddr_in6 *) addr)->sin6_addr;
ep->port =
(uint16_t) ((const struct sockaddr_in6 *) addr)->sin6_port;
break;
default:
errno = EAFNOSUPPORT;
size = -1;
goto done;
}
}
size = vls_sendto (vlsh, (void *) buf, n, flags, ep);
if (size < 0)
{
errno = -size;
size = -1;
}
}
else
{
size = libc_sendto (fd, buf, n, flags, addr, addr_len);
}
done:
return size;
}
/*
 * recvfrom() interposer.
 *
 * For fds that map to a VLS handle data is read with vls_recvfrom(); when
 * the caller supplied addr and bytes were received, the source endpoint is
 * copied out with ldp_copy_ep_to_sockaddr().  Other fds go to
 * libc_recvfrom().
 *
 * Returns the number of bytes received, or -1 with errno set.
 */
ssize_t
recvfrom (int fd, void *__restrict buf, size_t n, int flags,
          __SOCKADDR_ARG addr, socklen_t * __restrict addr_len)
{
  u8 src_storage[sizeof (struct sockaddr_in6)];
  vppcom_endpt_t src_ep;
  vls_handle_t vlsh;
  ssize_t n_read, copy_rv;

  if ((errno = -ldp_init ()))
    return -1;

  vlsh = ldp_fd_to_vlsh (fd);
  if (vlsh == VLS_INVALID_HANDLE)
    return libc_recvfrom (fd, buf, n, flags, addr, addr_len);

  if (!addr)
    {
      n_read = vls_recvfrom (vlsh, buf, n, flags, NULL);
    }
  else
    {
      src_ep.ip = src_storage;
      n_read = vls_recvfrom (vlsh, buf, n, flags, &src_ep);
      if (n_read > 0)
        {
          copy_rv = ldp_copy_ep_to_sockaddr (addr, addr_len, &src_ep);
          if (copy_rv < 0)
            n_read = copy_rv;
        }
    }

  if (n_read < 0)
    {
      errno = -n_read;
      n_read = -1;
    }

  return n_read;
}
/*
 * sendmsg() interposer.
 *
 * Not implemented for fds that map to a VLS handle (fails with ENOSYS);
 * other fds go to libc_sendmsg().
 */
ssize_t
sendmsg (int fd, const struct msghdr * message, int flags)
{
  vls_handle_t vlsh;

  if ((errno = -ldp_init ()))
    return -1;

  vlsh = ldp_fd_to_vlsh (fd);
  if (vlsh == VLS_INVALID_HANDLE)
    return libc_sendmsg (fd, message, flags);

  LDBG (0, "LDP-TBD");
  errno = ENOSYS;
  return -1;
}
#ifdef USE_GNU
/*
 * sendmmsg() interposer.
 *
 * Not implemented for fds that map to a VLS handle (fails with ENOSYS);
 * other fds go to libc_sendmmsg().  With LDP_DEBUG > 2 the call and its
 * outcome are logged; errno is preserved across the logging.
 */
int
sendmmsg (int fd, struct mmsghdr *vmessages, unsigned int vlen, int flags)
{
  /* Always initialised: the debug epilogue prints it on every path. */
  const char *func_str = __func__;
  vls_handle_t vlsh;
  int rv;

  if ((errno = -ldp_init ()))
    return -1;

  /* Look the fd up only after ldp_init() has succeeded. */
  vlsh = ldp_fd_to_vlsh (fd);
  if (vlsh != VLS_INVALID_HANDLE)
    {
      clib_warning ("LDP<%d>: LDP-TBD", getpid ());
      errno = ENOSYS;
      rv = -1;
    }
  else
    {
      func_str = "libc_sendmmsg";

      if (LDP_DEBUG > 2)
        clib_warning ("LDP<%d>: fd %d (0x%x): calling %s(): "
                      "vmessages %p, vlen %u, flags 0x%x",
                      getpid (), fd, fd, func_str, vmessages, vlen, flags);

      rv = libc_sendmmsg (fd, vmessages, vlen, flags);
    }

  if (LDP_DEBUG > 2)
    {
      if (rv < 0)
        {
          /* perror()/clib_warning() may clobber errno; save and restore */
          int errno_val = errno;
          perror (func_str);
          clib_warning ("LDP<%d>: ERROR: fd %d (0x%x): %s() failed! "
                        "rv %d, errno = %d", getpid (), fd, fd,
                        func_str, rv, errno_val);
          errno = errno_val;
        }
      else
        clib_warning ("LDP<%d>: fd %d (0x%x): returning %d (0x%x)",
                      getpid (), fd, fd, rv, rv);
    }
  return rv;
}
#endif
/*
 * recvmsg() interposer.
 *
 * Not implemented for fds that map to a VLS handle (fails with ENOSYS);
 * other fds go to libc_recvmsg().
 */
ssize_t
recvmsg (int fd, struct msghdr * message, int flags)
{
  vls_handle_t vlsh;

  if ((errno = -ldp_init ()))
    return -1;

  vlsh = ldp_fd_to_vlsh (fd);
  if (vlsh == VLS_INVALID_HANDLE)
    return libc_recvmsg (fd, message, flags);

  LDBG (0, "LDP-TBD");
  errno = ENOSYS;
  return -1;
}
#ifdef USE_GNU
/*
 * recvmmsg() interposer.
 *
 * Not implemented for fds that map to a VLS handle (fails with ENOSYS);
 * other fds go to libc_recvmmsg().  With LDP_DEBUG > 2 the call and its
 * outcome are logged; errno is preserved across the logging.
 */
int
recvmmsg (int fd, struct mmsghdr *vmessages,
          unsigned int vlen, int flags, struct timespec *tmo)
{
  /* Always initialised: the debug epilogue prints it on every path. */
  const char *func_str = __func__;
  vls_handle_t vlsh;
  int rv;

  if ((errno = -ldp_init ()))
    return -1;

  /* Look the fd up only after ldp_init() has succeeded. */
  vlsh = ldp_fd_to_vlsh (fd);
  if (vlsh != VLS_INVALID_HANDLE)
    {
      clib_warning ("LDP<%d>: LDP-TBD", getpid ());
      errno = ENOSYS;
      rv = -1;
    }
  else
    {
      func_str = "libc_recvmmsg";

      if (LDP_DEBUG > 2)
        clib_warning ("LDP<%d>: fd %d (0x%x): calling %s(): "
                      "vmessages %p, vlen %u, flags 0x%x, tmo %p",
                      getpid (), fd, fd, func_str, vmessages, vlen,
                      flags, tmo);

      rv = libc_recvmmsg (fd, vmessages, vlen, flags, tmo);
    }

  if (LDP_DEBUG > 2)
    {
      if (rv < 0)
        {
          /* perror()/clib_warning() may clobber errno; save and restore */
          int errno_val = errno;
          perror (func_str);
          clib_warning ("LDP<%d>: ERROR: fd %d (0x%x): %s() failed! "
                        "rv %d, errno = %d", getpid (), fd, fd,
                        func_str, rv, errno_val);
          errno = errno_val;
        }
      else
        clib_warning ("LDP<%d>: fd %d (0x%x): returning %d (0x%x)",
                      getpid (), fd, fd, rv, rv);
    }
  return rv;
}
#endif
/*
 * getsockopt() interposer.
 *
 * For fds that map to a VLS handle a fixed set of SOL_TCP, SOL_IPV6 and
 * SOL_SOCKET options is answered, mostly through vls_attr(); anything else
 * fails with EOPNOTSUPP.  Other fds go to libc_getsockopt().
 *
 * Special cases on the VLS path:
 *   TCP_INFO        zero-fills optval when *optlen == sizeof (struct
 *                   tcp_info), otherwise fails with EFAULT.
 *   TCP_CONGESTION  always reports "cubic", truncated to *optlen bytes.
 *   SO_PROTOCOL     maps a non-zero VCL protocol value to SOCK_DGRAM and
 *                   zero to SOCK_STREAM.
 *
 * Returns 0 on success, -1 with errno set on failure.
 */
int
getsockopt (int fd, int level, int optname,
            void *__restrict optval, socklen_t * __restrict optlen)
{
  vls_handle_t vlsh;
  int rv;

  if ((errno = -ldp_init ()))
    return -1;

  vlsh = ldp_fd_to_vlsh (fd);
  if (vlsh != VLS_INVALID_HANDLE)
    {
      rv = -EOPNOTSUPP;

      switch (level)
        {
        case SOL_TCP:
          switch (optname)
            {
            case TCP_NODELAY:
              rv = vls_attr (vlsh, VPPCOM_ATTR_GET_TCP_NODELAY,
                             optval, optlen);
              break;
            case TCP_MAXSEG:
              rv = vls_attr (vlsh, VPPCOM_ATTR_GET_TCP_USER_MSS,
                             optval, optlen);
              break;
            case TCP_KEEPIDLE:
              rv = vls_attr (vlsh, VPPCOM_ATTR_GET_TCP_KEEPIDLE,
                             optval, optlen);
              break;
            case TCP_KEEPINTVL:
              rv = vls_attr (vlsh, VPPCOM_ATTR_GET_TCP_KEEPINTVL,
                             optval, optlen);
              break;
            case TCP_INFO:
              if (optval && optlen && (*optlen == sizeof (struct tcp_info)))
                {
                  LDBG (1, "fd %d: vlsh %u SOL_TCP, TCP_INFO, optval %p, "
                        "optlen %d: #LDP-NOP#", fd, vlsh, optval, *optlen);
                  memset (optval, 0, *optlen);
                  rv = VPPCOM_OK;
                }
              else
                rv = -EFAULT;
              break;
            case TCP_CONGESTION:
              if (optval && optlen)
                {
                  static const char cc_name[] = "cubic";
                  const socklen_t name_len = sizeof (cc_name) - 1;

                  if (*optlen > name_len)
                    {
                      /* Room for the name and its terminator; report the
                       * length without the terminator.
                       */
                      memcpy (optval, cc_name, name_len + 1);
                      *optlen = name_len;
                    }
                  else
                    {
                      /* Never write past the caller's buffer. */
                      memcpy (optval, cc_name, *optlen);
                    }
                  rv = 0;
                }
              else
                rv = -EFAULT;
              break;
            default:
              LDBG (0, "ERROR: fd %d: getsockopt SOL_TCP: sid %u, "
                    "optname %d unsupported!", fd, vlsh, optname);
              break;
            }
          break;
        case SOL_IPV6:
          switch (optname)
            {
            case IPV6_V6ONLY:
              rv = vls_attr (vlsh, VPPCOM_ATTR_GET_V6ONLY, optval, optlen);
              break;
            default:
              LDBG (0, "ERROR: fd %d: getsockopt SOL_IPV6: vlsh %u "
                    "optname %d unsupported!", fd, vlsh, optname);
              break;
            }
          break;
        case SOL_SOCKET:
          switch (optname)
            {
            case SO_ACCEPTCONN:
              rv = vls_attr (vlsh, VPPCOM_ATTR_GET_LISTEN, optval, optlen);
              break;
            case SO_KEEPALIVE:
              rv = vls_attr (vlsh, VPPCOM_ATTR_GET_KEEPALIVE, optval, optlen);
              break;
            case SO_PROTOCOL:
              rv = vls_attr (vlsh, VPPCOM_ATTR_GET_PROTOCOL, optval, optlen);
              /* Only translate a value that was actually produced.
               * NOTE(review): the result is a SOCK_* socket type, not an
               * IPPROTO_* number -- confirm that is what callers want.
               */
              if (rv == VPPCOM_OK && optval)
                *(int *) optval = *(int *) optval ? SOCK_DGRAM : SOCK_STREAM;
              break;
            case SO_SNDBUF:
              rv = vls_attr (vlsh, VPPCOM_ATTR_GET_TX_FIFO_LEN,
                             optval, optlen);
              break;
            case SO_RCVBUF:
              rv = vls_attr (vlsh, VPPCOM_ATTR_GET_RX_FIFO_LEN,
                             optval, optlen);
              break;
            case SO_REUSEADDR:
              rv = vls_attr (vlsh, VPPCOM_ATTR_GET_REUSEADDR, optval, optlen);
              break;
            case SO_BROADCAST:
              rv = vls_attr (vlsh, VPPCOM_ATTR_GET_BROADCAST, optval, optlen);
              break;
            case SO_ERROR:
              rv = vls_attr (vlsh, VPPCOM_ATTR_GET_ERROR, optval, optlen);
              break;
            default:
              LDBG (0, "ERROR: fd %d: getsockopt SOL_SOCKET: vlsh %u "
                    "optname %d unsupported!", fd, vlsh, optname);
              break;
            }
          break;
        default:
          break;
        }

      if (rv != VPPCOM_OK)
        {
          errno = -rv;
          rv = -1;
        }
    }
  else
    {
      rv = libc_getsockopt (fd, level, optname, optval, optlen);
    }

  return rv;
}
int
setsockopt (int fd, int level, int optname,
const void *optval, socklen_t optlen)
{
vls_handle_t vlsh;
int rv;
if ((errno = -ldp_init ()))
return -1;
vlsh = ldp_fd_to_vlsh (fd);
if (vlsh != VLS_INVALID_HANDLE)
{
rv = -EOPNOTSUPP;
switch (level)
{
case SOL_TCP:
switch (optname)
{
case TCP_NODELAY:
rv = vls_attr (vlsh, VPPCOM_ATTR_SET_TCP_NODELAY,
(void *) optval, &optlen);
break;
case TCP_MAXSEG:
rv = vls_attr (vlsh, VPPCOM_ATTR_SET_TCP_USER_MSS,
(void *) optval, &optlen);
break;
case TCP_KEEPIDLE:
rv = vls_attr (vlsh, VPPCOM_ATTR_SET_TCP_KEEPIDLE,
(void *) optval, &optlen);
break;
case TCP_KEEPINTVL:
rv = vls_attr (vlsh, VPPCOM_ATTR_SET_TCP_KEEPINTVL,
(void *) optval, &optlen);
break;
case TCP_CONGESTION:
case TCP_CORK:
/* Ignore */
rv = 0;
break;
default:
LDBG (0, "ERROR: fd %d: setsockopt() SOL_TCP: vlsh %u"
"optname %d unsupported!", fd, vlsh, optname);
break;
}
break;
case SOL_IPV6:
switch (optname)
{
case IPV6_V6ONLY:
rv = vls_attr (vlsh, VPPCOM_ATTR_SET_V6ONLY,
(void *) optval, &optlen);
break;
default:
LDBG (0, "ERROR: fd %d: setsockopt SOL_IPV6: vlsh %u"
"optname %d unsupported!", fd, vlsh, optname);
break;
}
break;
case SOL_SOCKET:
switch (optname)
{
case SO_KEEPALIVE:
rv = vls_attr (vlsh, VPPCOM_ATTR_SET_KEEPALIVE,
(void *) optval, &optlen);
break;
case SO_REUSEADDR:
rv = vls_attr (vlsh, VPPCOM_ATTR_SET_REUSEADDR,
(void *) optval, &optlen);
break;
case SO_BROADCAST:
rv = vls_attr (vlsh, VPPCOM_ATTR_SET_BROADCAST,
(void *) optval, &optlen);
break;
default:
LDBG (0, "ERROR: fd %d: setsockopt SOL_SOCKET: vlsh %u "
"optname %d unsupported!", fd, vlsh, optname);
break;
}
break;
default:
break;
}
if (rv != VPPCOM_OK)
{
errno = -rv;
rv = -1;
}
}
else
{
rv = libc_setsockopt (fd, level, optname, optval, optlen);
}
return rv;
}
/*
 * listen() interposer: vls_listen() for fds that map to a VLS handle,
 * libc_listen() otherwise.
 *
 * Returns 0 on success, -1 with errno set on failure.
 */
int
listen (int fd, int n)
{
  vls_handle_t vlsh;
  int result;

  if ((errno = -ldp_init ()))
    return -1;

  vlsh = ldp_fd_to_vlsh (fd);
  if (vlsh == VLS_INVALID_HANDLE)
    {
      LDBG (0, "fd %d: calling libc_listen(): n %d", fd, n);
      result = libc_listen (fd, n);
    }
  else
    {
      LDBG (0, "fd %d: calling vls_listen: vlsh %u, n %d", fd, vlsh, n);
      result = vls_listen (vlsh, n);
      if (result != VPPCOM_OK)
        {
          errno = -result;
          result = -1;
        }
    }

  LDBG (1, "fd %d: returning %d", fd, result);
  return result;
}
/*
 * Common implementation behind accept() and accept4().
 *
 * For listeners that map to a VLS handle a session is accepted with
 * vls_accept() and the peer endpoint is copied to addr / addr_len; if that
 * copy fails the freshly accepted session is closed again.  Other fds go
 * to libc_accept4().
 *
 * Returns the new fd, or -1 with errno set.
 */
static inline int
ldp_accept4 (int listen_fd, __SOCKADDR_ARG addr,
             socklen_t * __restrict addr_len, int flags)
{
  vls_handle_t listen_vlsh, accept_vlsh;
  int rv;

  if ((errno = -ldp_init ()))
    return -1;

  listen_vlsh = ldp_fd_to_vlsh (listen_fd);
  if (listen_vlsh != VLS_INVALID_HANDLE)
    {
      vppcom_endpt_t ep;
      u8 src_addr[sizeof (struct sockaddr_in6)];
      memset (&ep, 0, sizeof (ep));
      ep.ip = src_addr;

      /* %p needs a pointer: log the address of ep, not the struct. */
      LDBG (0, "listen fd %d: calling vppcom_session_accept: listen sid %u,"
            " ep %p, flags 0x%x", listen_fd, listen_vlsh, &ep, flags);

      accept_vlsh = vls_accept (listen_vlsh, &ep, flags);
      if (accept_vlsh < 0)
        {
          errno = -accept_vlsh;
          rv = -1;
        }
      else
        {
          rv = ldp_copy_ep_to_sockaddr (addr, addr_len, &ep);
          if (rv != VPPCOM_OK)
            {
              (void) vls_close (accept_vlsh);
              errno = -rv;
              rv = -1;
            }
          else
            {
              rv = ldp_vlsh_to_fd (accept_vlsh);
            }
        }
    }
  else
    {
      LDBG (0, "listen fd %d: calling libc_accept4(): addr %p, addr_len %p,"
            " flags 0x%x", listen_fd, addr, addr_len, flags);

      rv = libc_accept4 (listen_fd, addr, addr_len, flags);
    }

  LDBG (1, "listen fd %d: accept returning %d", listen_fd, rv);

  return rv;
}
/*
 * accept4() interposer: forwards all arguments to ldp_accept4().
 */
int
accept4 (int fd, __SOCKADDR_ARG addr, socklen_t * __restrict addr_len,
         int flags)
{
  int new_fd;

  new_fd = ldp_accept4 (fd, addr, addr_len, flags);
  return new_fd;
}
/*
 * accept() interposer: ldp_accept4() with no flags.
 */
int
accept (int fd, __SOCKADDR_ARG addr, socklen_t * __restrict addr_len)
{
  const int no_flags = 0;

  return ldp_accept4 (fd, addr, addr_len, no_flags);
}
/*
 * shutdown() interposer.
 *
 * For fds that map to a VLS handle 'how' is stored with
 * VPPCOM_ATTR_SET_SHUT and the resulting state read back with
 * VPPCOM_ATTR_GET_SHUT.  If either attribute call fails the fd is closed
 * and -1 is returned.  If the state read back equals SHUT_RDWR the fd is
 * closed and the result of close() is returned; otherwise 0.
 * Other fds go to libc_shutdown().
 */
int
shutdown (int fd, int how)
{
  vls_handle_t vlsh;
  int shut_state;
  u32 attr_len = sizeof (shut_state);

  if ((errno = -ldp_init ()))
    return -1;

  vlsh = ldp_fd_to_vlsh (fd);
  if (vlsh == VLS_INVALID_HANDLE)
    {
      LDBG (0, "fd %d: calling libc_shutdown: how %d", fd, how);
      return libc_shutdown (fd, how);
    }

  LDBG (0, "called shutdown: fd %u vlsh %u how %d", fd, vlsh, how);

  /* short-circuit keeps the second call from running after a failure */
  if (vls_attr (vlsh, VPPCOM_ATTR_SET_SHUT, &how, &attr_len) ||
      vls_attr (vlsh, VPPCOM_ATTR_GET_SHUT, &shut_state, &attr_len))
    {
      close (fd);
      return -1;
    }

  if (shut_state != SHUT_RDWR)
    return 0;

  return close (fd);
}
int
epoll_create1 (int flags)
{
ldp_worker_ctx_t *ldpw = ldp_worker_get_current ();
vls_handle_t vlsh;
int rv;
if ((errno = -ldp_init ()))
return -1;
if (ldp->vcl_needs_real_epoll)
{
/* Make sure workers have been allocated */
if (!ldp->workers)
{
ldp_alloc_workers ();
ldpw = ldp_worker_get_current ();
}
rv = libc_epoll_create1 (flags);
ldp->vcl_needs_real_epoll = 0;
ldpw->vcl_mq_epfd = rv;
LDBG (0, "created vcl epfd %u", rv);
return rv;
}
vlsh = vls_epoll_create ();
if (PREDICT_FALSE (vlsh == VLS_INVALID_HANDLE))
{
errno = -vlsh;
rv = -1;
}
else
{
rv = ldp_vlsh_to_fd (vlsh);
}
LDBG (0, "epoll_create epfd %u vlsh %u", rv, vlsh);
return rv;
}
/*
 * epoll_create() interposer: 'size' is not used; the call is served by
 * epoll_create1() with no flags.
 */
int
epoll_create (int size)
{
  const int no_flags = 0;

  return epoll_create1 (no_flags);
}
/*
 * epoll_ctl() interposer.
 *
 * - epfd without a VLS handle: forwarded to libc_epoll_ctl().
 * - epfd and fd both with VLS handles: vls_epoll_ctl().
 * - epfd with a VLS handle, fd without: the operation is applied to a libc
 *   epoll fd attached to the VLS epoll handle (VPPCOM_ATTR_GET_LIBC_EPFD);
 *   it is created with EPOLL_CLOEXEC and attached on first use.
 *
 * Returns 0 on success, -1 with errno set on failure.
 */
int
epoll_ctl (int epfd, int op, int fd, struct epoll_event *event)
{
  vls_handle_t vep_vlsh, vlsh;
  int rv;

  if ((errno = -ldp_init ()))
    return -1;

  vep_vlsh = ldp_fd_to_vlsh (epfd);
  if (PREDICT_FALSE (vep_vlsh == VLS_INVALID_HANDLE))
    {
      /* The LDP epoll_create1 always creates VCL epfd's.
       * The app should never have a kernel base epoll fd unless it
       * was acquired outside of the LD_PRELOAD process context.
       * In any case, if we get one, punt it to libc_epoll_ctl.
       */
      LDBG (1, "epfd %d: calling libc_epoll_ctl: op %d, fd %d"
            " event %p", epfd, op, fd, event);

      rv = libc_epoll_ctl (epfd, op, fd, event);
      goto done;
    }

  vlsh = ldp_fd_to_vlsh (fd);

  LDBG (0, "epfd %d ep_vlsh %d, fd %u vlsh %d, op %u", epfd, vep_vlsh, fd,
        vlsh, op);

  if (vlsh != VLS_INVALID_HANDLE)
    {
      /* one argument per conversion: epfd, ep_vlsh, op, vlsh, event */
      LDBG (1, "epfd %d: calling vls_epoll_ctl: ep_vlsh %d op %d, vlsh %u,"
            " event %p", epfd, vep_vlsh, op, vlsh, event);

      rv = vls_epoll_ctl (vep_vlsh, op, vlsh, event);
      if (rv != VPPCOM_OK)
        {
          errno = -rv;
          rv = -1;
        }
    }
  else
    {
      int libc_epfd;
      u32 size = sizeof (libc_epfd);

      libc_epfd = vls_attr (vep_vlsh, VPPCOM_ATTR_GET_LIBC_EPFD, 0, 0);
      if (!libc_epfd)
        {
          LDBG (1, "epfd %d, vep_vlsh %d calling libc_epoll_create1: "
                "EPOLL_CLOEXEC", epfd, vep_vlsh);

          libc_epfd = libc_epoll_create1 (EPOLL_CLOEXEC);
          if (libc_epfd < 0)
            {
              /* errno was set by libc_epoll_create1() */
              rv = libc_epfd;
              goto done;
            }

          rv = vls_attr (vep_vlsh, VPPCOM_ATTR_SET_LIBC_EPFD, &libc_epfd,
                         &size);
          if (rv < 0)
            {
              errno = -rv;
              rv = -1;
              goto done;
            }
        }
      else if (PREDICT_FALSE (libc_epfd < 0))
        {
          /* the failing value is the attribute result, not the caller's
           * epfd
           */
          errno = -libc_epfd;
          rv = -1;
          goto done;
        }

      LDBG (1, "epfd %d: calling libc_epoll_ctl: libc_epfd %d, op %d, fd %d,"
            " event %p", epfd, libc_epfd, op, fd, event);

      rv = libc_epoll_ctl (libc_epfd, op, fd, event);
    }

done:
  return rv;
}
/*
 * Common implementation behind epoll_wait() and epoll_pwait().
 *
 * Fails with EFAULT when events is NULL or timeout < -1, and with EBADFD
 * when epfd has no VLS handle.  If epfd is the current worker's
 * vcl_mq_epfd the call goes straight to libc_epoll_pwait().
 *
 * Otherwise the VLS epoll handle and, when one is attached (libc_epfd > 0),
 * its libc epoll fd are polled without blocking in a loop until one of them
 * returns a non-zero result or the timeout (milliseconds) has elapsed;
 * timeout == -1 loops until something is returned.
 *
 * Returns the number of events, 0 on timeout, or -1 with errno set.
 */
static inline int
ldp_epoll_pwait (int epfd, struct epoll_event *events, int maxevents,
int timeout, const sigset_t * sigmask)
{
ldp_worker_ctx_t *ldpw = ldp_worker_get_current ();
double time_to_wait = (double) 0, max_time;
int libc_epfd, rv = 0;
vls_handle_t ep_vlsh;
if ((errno = -ldp_init ()))
return -1;
if (PREDICT_FALSE (!events || (timeout < -1)))
{
errno = EFAULT;
return -1;
}
/* The fd recorded by epoll_create1() as vcl_mq_epfd is a libc epoll fd. */
if (epfd == ldpw->vcl_mq_epfd)
return libc_epoll_pwait (epfd, events, maxevents, timeout, sigmask);
ep_vlsh = ldp_fd_to_vlsh (epfd);
if (PREDICT_FALSE (ep_vlsh == VLS_INVALID_HANDLE))
{
LDBG (0, "epfd %d: bad ep_vlsh %d!", epfd, ep_vlsh);
errno = EBADFD;
return -1;
}
if (PREDICT_FALSE (ldpw->clib_time.init_cpu_time == 0))
clib_time_init (&ldpw->clib_time);
/* timeout is in milliseconds; max_time is the deadline on the worker clock */
time_to_wait = ((timeout >= 0) ? (double) timeout / 1000 : 0);
max_time = clib_time_now (&ldpw->clib_time) + time_to_wait;
libc_epfd = vls_attr (ep_vlsh, VPPCOM_ATTR_GET_LIBC_EPFD, 0, 0);
if (PREDICT_FALSE (libc_epfd < 0))
{
errno = -libc_epfd;
rv = -1;
goto done;
}
LDBG (2, "epfd %d: vep_idx %d, libc_epfd %d, events %p, maxevents %d, "
"timeout %d, sigmask %p: time_to_wait %.02f", epfd, ep_vlsh,
libc_epfd, events, maxevents, timeout, sigmask, time_to_wait);
do
{
/* epoll_wait_vcl is set after the VLS side delivered events; the next pass
 * through here then skips the VLS poll once (clearing the flag) and goes to
 * the libc epoll fd first.
 */
if (!ldpw->epoll_wait_vcl)
{
rv = vls_epoll_wait (ep_vlsh, events, maxevents, 0);
if (rv > 0)
{
ldpw->epoll_wait_vcl = 1;
goto done;
}
else if (rv < 0)
{
errno = -rv;
rv = -1;
goto done;
}
}
else
ldpw->epoll_wait_vcl = 0;
/* Only poll libc when a libc epoll fd is attached to this handle. */
if (libc_epfd > 0)
{
rv = libc_epoll_pwait (libc_epfd, events, maxevents, 0, sigmask);
if (rv != 0)
goto done;
}
}
while ((timeout == -1) || (clib_time_now (&ldpw->clib_time) < max_time));
done:
return rv;
}
/*
 * epoll_pwait() interposer: forwards all arguments to ldp_epoll_pwait().
 */
int
epoll_pwait (int epfd, struct epoll_event *events,
             int maxevents, int timeout, const sigset_t * sigmask)
{
  int n_events;

  n_events = ldp_epoll_pwait (epfd, events, maxevents, timeout, sigmask);
  return n_events;
}
/*
 * epoll_wait() interposer: ldp_epoll_pwait() without a signal mask.
 */
int
epoll_wait (int epfd, struct epoll_event *events, int maxevents, int timeout)
{
  int n_events;

  n_events = ldp_epoll_pwait (epfd, events, maxevents, timeout, NULL);
  return n_events;
}
/*
 * poll() interposer.
 *
 * The pollfd array is split in two: entries whose fd maps to a VLS handle
 * are appended to the worker's vcl_poll vector, the rest are copied to
 * libc_poll (with their original index kept in libc_poll_idxs).  Both sets
 * are then polled without blocking in a loop until either reports events,
 * an error occurs, or the timeout (milliseconds) has elapsed; a negative
 * timeout loops until something is reported.
 *
 * Returns the number of entries with events, 0 on timeout, or -1 with
 * errno set.
 */
int
poll (struct pollfd *fds, nfds_t nfds, int timeout)
{
ldp_worker_ctx_t *ldpw = ldp_worker_get_current ();
int rv, i, n_revents = 0;
vls_handle_t vlsh;
vcl_poll_t *vp;
double max_time;
LDBG (3, "fds %p, nfds %d, timeout %d", fds, nfds, timeout);
if (PREDICT_FALSE (ldpw->clib_time.init_cpu_time == 0))
clib_time_init (&ldpw->clib_time);
/* max_time is the deadline on the worker clock */
max_time = (timeout >= 0) ? (f64) timeout / 1000 : 0;
max_time += clib_time_now (&ldpw->clib_time);
for (i = 0; i < nfds; i++)
{
/* negative fds are skipped entirely */
if (fds[i].fd < 0)
continue;
vlsh = ldp_fd_to_vlsh (fds[i].fd);
if (vlsh != VLS_INVALID_HANDLE)
{
/* The caller's fd is negated here and restored in the cleanup loop below. */
fds[i].fd = -fds[i].fd;
vec_add2 (ldpw->vcl_poll, vp, 1);
vp->fds_ndx = i;
vp->sh = vlsh_to_sh (vlsh);
vp->events = fds[i].events;
#ifdef __USE_XOPEN2K
/* request POLLIN / POLLOUT whenever the *NORM variants were asked for */
if (fds[i].events & POLLRDNORM)
vp->events |= POLLIN;
if (fds[i].events & POLLWRNORM)
vp->events |= POLLOUT;
#endif
vp->revents = fds[i].revents;
}
else
{
vec_add1 (ldpw->libc_poll, fds[i]);
vec_add1 (ldpw->libc_poll_idxs, i);
}
}
do
{
if (vec_len (ldpw->vcl_poll))
{
rv = vppcom_poll (ldpw->vcl_poll, vec_len (ldpw->vcl_poll), 0);
if (rv < 0)
{
errno = -rv;
rv = -1;
goto done;
}
else
n_revents += rv;
}
if (vec_len (ldpw->libc_poll))
{
rv = libc_poll (ldpw->libc_poll, vec_len (ldpw->libc_poll), 0);
if (rv < 0)
goto done;
else
n_revents += rv;
}
if (n_revents)
{
rv = n_revents;
goto done;
}
}
while ((timeout < 0) || (clib_time_now (&ldpw->clib_time) < max_time));
rv = 0;
done:
/* Copy results back to the caller's array and reset the scratch vectors. */
vec_foreach (vp, ldpw->vcl_poll)
{
fds[vp->fds_ndx].fd = -fds[vp->fds_ndx].fd;
fds[vp->fds_ndx].revents = vp->revents;
#ifdef __USE_XOPEN2K
/* mirror POLLIN / POLLOUT back into the *NORM bits the caller requested */
if ((fds[vp->fds_ndx].revents & POLLIN) &&
(fds[vp->fds_ndx].events & POLLRDNORM))
fds[vp->fds_ndx].revents |= POLLRDNORM;
if ((fds[vp->fds_ndx].revents & POLLOUT) &&
(fds[vp->fds_ndx].events & POLLWRNORM))
fds[vp->fds_ndx].revents |= POLLWRNORM;
#endif
}
vec_reset_length (ldpw->vcl_poll);
for (i = 0; i < vec_len (ldpw->libc_poll); i++)
{
fds[ldpw->libc_poll_idxs[i]].revents = ldpw->libc_poll[i].revents;
}
vec_reset_length (ldpw->libc_poll_idxs);
vec_reset_length (ldpw->libc_poll);
return rv;
}
#ifdef USE_GNU
/*
 * ppoll() interposer: not implemented.  After a successful ldp_init() it
 * logs a warning and fails with ENOSYS for every fd.
 */
int
ppoll (struct pollfd *fds, nfds_t nfds,
       const struct timespec *timeout, const sigset_t * sigmask)
{
  int init_rv = ldp_init ();

  errno = -init_rv;
  if (errno)
    return -1;

  clib_warning ("LDP<%d>: LDP-TBD", getpid ());
  errno = ENOSYS;

  return -1;
}
#endif
void CONSTRUCTOR_ATTRIBUTE ldp_constructor (void);

void DESTRUCTOR_ATTRIBUTE ldp_destructor (void);

/*
 * Library load hook: runs swrap_constructor() and then ldp_init().
 * If ldp_init() fails, an error is printed to stderr and the process
 * terminates with _exit (1).
 */
void
ldp_constructor (void)
{
  swrap_constructor ();

  if (ldp_init () == 0)
    {
      if (LDP_DEBUG > 0)
        clib_warning ("LDP<%d>: LDP constructor: done!\n", getpid ());
      return;
    }

  fprintf (stderr, "\nLDP<%d>: ERROR: ldp_constructor: failed!\n",
           getpid ());
  _exit (1);
}
/*
 * Library unload hook.  The teardown below is commented out, so the only
 * effect is an optional debug message.
 */
void
ldp_destructor (void)
{
  /*
     swrap_destructor ();
     if (ldp->init)
     ldp->init = 0;
   */

  if (LDP_DEBUG <= 0)
    return;

  /* Don't use clib_warning() here because that calls writev()
   * which will call ldp_init().
   */
  fprintf (stderr, "%s:%d: LDP<%d>: LDP destructor: done!\n",
           __func__, __LINE__, getpid ());
}
/*
* fd.io coding-style-patch-verification: ON
*
* Local Variables:
* eval: (c-set-style "gnu")
* End:
*/