/* * Copyright (c) 2021 Cisco and/or its affiliates. * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at: * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include "netns.h" #include "rtnl.h" #undef DBL_MAX #define DBL_MAX 1000000000.0 typedef enum { RTNL_E_OPEN, RTNL_E_CLOSE, RTNL_E_READ, } rtnl_event_t; typedef enum { RTNL_S_INIT, RTNL_S_SYNC, RTNL_S_READY, } rtnl_state_t; typedef enum { RTNL_SS_OPENING, RTNL_SS_LINK, RTNL_SS_ADDR, RTNL_SS_ROUTE4, RTNL_SS_ROUTE6, RTNL_SS_NEIGH, } rtnl_sync_state_t; typedef struct { rtnl_stream_t stream; rtnl_state_t state; rtnl_sync_state_t sync_state; int ns_fd; int rtnl_socket; u32 unix_index; u32 rtnl_seq; f64 timeout; } rtnl_ns_t; typedef struct { f64 now; rtnl_ns_t *streams; } rtnl_main_t; static rtnl_main_t rtnl_main; static vlib_node_registration_t rtnl_process_node; #define RTNL_BUFFSIZ 16384 #define RTNL_DUMP_TIMEOUT 1 static inline u32 grpmask (u32 g) { ASSERT (g <= 31); if (g) { return 1 << (g - 1); } else return 0; } u8 * format_rtnl_nsname2path (u8 *s, va_list *args) { char *nsname = va_arg (*args, char *); if (!nsname || !strlen (nsname)) { return format (s, "/proc/self/ns/net"); } else if (strpbrk (nsname, "/") != NULL) { return format (s, "%s", nsname); } else { return format (s, "/var/run/netns/%s", nsname); } } static_always_inline void rtnl_schedule_timeout (rtnl_ns_t *ns, f64 when) { ns->timeout = when; } static_always_inline void rtnl_cancel_timeout (rtnl_ns_t *ns) { ns->timeout = DBL_MAX; } static clib_error_t * rtnl_read_cb (struct clib_file *f) { rtnl_main_t *rm = &rtnl_main; vlib_main_t *vm = vlib_get_main (); rtnl_ns_t *ns = &rm->streams[f->private_data]; vlib_process_signal_event (vm, rtnl_process_node.index, RTNL_E_READ, (uword) (ns - rm->streams)); return 0; } int rtnl_dump_request (rtnl_ns_t *ns, int type, void *req, size_t len) { struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK }; struct nlmsghdr nlh = { .nlmsg_len = NLMSG_LENGTH (len), .nlmsg_type = type, .nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, .nlmsg_pid = 0, .nlmsg_seq = ++ns->rtnl_seq, }; struct iovec iov[2] = { { .iov_base = &nlh, .iov_len = sizeof (nlh) }, { .iov_base = req, .iov_len = len } }; struct msghdr msg = { .msg_name = &nladdr, .msg_namelen = sizeof (nladdr), .msg_iov = iov, .msg_iovlen = 2, }; if (sendmsg (ns->rtnl_socket, &msg, 0) < 0) { clib_warning ("sendmsg error: %s", strerror (errno)); return -1; } return 0; } static void rtnl_socket_close (rtnl_ns_t *ns) { clib_file_del (&file_main, &file_main.file_pool[ns->unix_index]); close (ns->rtnl_socket); } struct rtnl_thread_exec { int fd; void *(*fn) (void *); void *arg; void **ret; }; static void * rtnl_exec_in_thread_fn (void *p) { struct rtnl_thread_exec *ex = (struct rtnl_thread_exec *) p; if (setns (ex->fd, 0)) return (void *) ((uword) (-errno)); *ex->ret = ex->fn (ex->arg); return NULL; } static int rtnl_exec_in_namespace_byfd (int fd, void *(*fn) (void *), void *arg, void **ret) { pthread_t thread; void *thread_ret; struct rtnl_thread_exec ex = { .fd = fd, .fn = fn, .arg = arg, .ret = ret }; if (pthread_create (&thread, NULL, rtnl_exec_in_thread_fn, &ex)) return -errno; if (pthread_join (thread, &thread_ret)) return -errno; if (thread_ret) return (int) ((uword) thread_ret); return 0; } int rtnl_exec_in_namespace (u32 stream_index, void *(*fn) (void *), void *arg, void **ret) { rtnl_main_t *rm = &rtnl_main; if (pool_is_free_index (rm->streams, stream_index)) return -EBADR; rtnl_ns_t *ns = pool_elt_at_index (rm->streams, stream_index); return rtnl_exec_in_namespace_byfd (ns->ns_fd, fn, arg, ret); } int rtnl_exec_in_namespace_by_name (char *nsname, void *(*fn) (void *), void *arg, void **ret) { int fd; u8 *s = format ((u8 *) 0, "%U", format_rtnl_nsname2path, nsname); if ((fd = open ((char *) s, O_RDONLY)) < 0) { vec_free (s); return -errno; } int r = rtnl_exec_in_namespace_byfd (fd, fn, arg, ret); vec_free (s); close (fd); return r; } /* this function is run by the second thread */ static void * rtnl_thread_fn (void *p) { rtnl_ns_t *ns = (rtnl_ns_t *) p; if (setns (ns->ns_fd, 0)) { clib_warning ("setns(%d, %d) error %d", ns->ns_fd, CLONE_NEWNET, errno); return (void *) -1; } if ((ns->rtnl_socket = socket (AF_NETLINK, SOCK_RAW, NETLINK_ROUTE)) == -1) { clib_warning ("Cannot open socket"); return (void *) -2; } return NULL; } static int rtnl_socket_open (rtnl_ns_t *ns) { rtnl_main_t *rm = &rtnl_main; pthread_t thread; void *thread_ret; if (pthread_create (&thread, NULL, rtnl_thread_fn, ns)) { clib_warning ("Can't create opening thread"); return -1; } if (pthread_join (thread, &thread_ret)) { clib_warning ("Can't join opening thread"); return -2; } if (thread_ret) { clib_warning ("Could not open netlink socket"); return -3; } struct sockaddr_nl addr = { .nl_family = AF_NETLINK, .nl_pad = 0, .nl_pid = 0, /*add mpls message group*/ .nl_groups = grpmask (RTNLGRP_LINK) | grpmask (RTNLGRP_IPV6_IFADDR) | grpmask (RTNLGRP_IPV4_IFADDR) | grpmask (RTNLGRP_IPV4_ROUTE) | grpmask (RTNLGRP_IPV6_ROUTE) | grpmask (RTNLGRP_NEIGH) | grpmask (RTNLGRP_NOTIFY) /* | grpmask(RTNLGRP_MPLS_ROUTE)*/, }; if (bind (ns->rtnl_socket, (struct sockaddr *) &addr, sizeof (addr))) { close (ns->rtnl_socket); return -3; } clib_file_t template = { 0 }; template.read_function = rtnl_read_cb; template.file_descriptor = ns->rtnl_socket; template.private_data = (uword) (ns - rm->streams); ns->unix_index = clib_file_add (&file_main, &template); return 0; } static int rtnl_rcv_error (rtnl_ns_t *ns, struct nlmsghdr *hdr, int *error) { struct nlmsgerr *err = NLMSG_DATA (hdr); size_t datalen = hdr->nlmsg_len - NLMSG_ALIGN (sizeof (*hdr)); if (datalen < sizeof (*err)) return -1; *error = err->error; return 0; } static void rtnl_sync_reset (rtnl_ns_t *ns) { if (ns->sync_state == RTNL_SS_OPENING) return; rtnl_socket_close (ns); ns->sync_state = RTNL_SS_OPENING; } static void rtnl_sync_done (rtnl_ns_t *ns) { rtnl_main_t *rm = &rtnl_main; struct ifaddrmsg addrmsg; struct rtmsg rtmsg; struct ndmsg ndmsg; switch (ns->sync_state) { case RTNL_SS_OPENING: // Cannot happen here break; case RTNL_SS_LINK: memset (&addrmsg, 0, sizeof (addrmsg)); addrmsg.ifa_family = AF_UNSPEC; if (rtnl_dump_request (ns, RTM_GETADDR, &addrmsg, sizeof (addrmsg))) { rtnl_sync_reset (ns); rtnl_schedule_timeout (ns, rm->now + 1); return; } rtnl_schedule_timeout (ns, rm->now + RTNL_DUMP_TIMEOUT); ns->sync_state = RTNL_SS_ADDR; break; case RTNL_SS_ADDR: case RTNL_SS_ROUTE4: memset (&rtmsg, 0, sizeof (rtmsg)); rtmsg.rtm_family = (ns->sync_state == RTNL_SS_ADDR) ? AF_INET : AF_INET6; rtmsg.rtm_table = RT_TABLE_UNSPEC; if (rtnl_dump_request (ns, RTM_GETROUTE, &rtmsg, sizeof (rtmsg))) { rtnl_sync_reset (ns); rtnl_schedule_timeout (ns, rm->now + 1); return; } rtnl_schedule_timeout (ns, rm->now + RTNL_DUMP_TIMEOUT); ns->sync_state = (ns->sync_state == RTNL_SS_ADDR) ? RTNL_SS_ROUTE4 : RTNL_SS_ROUTE6; break; case RTNL_SS_ROUTE6: memset (&ndmsg, 0, sizeof (ndmsg)); ndmsg.ndm_family = AF_UNSPEC; if (rtnl_dump_request (ns, RTM_GETNEIGH, &ndmsg, sizeof (ndmsg))) { rtnl_sync_reset (ns); rtnl_schedule_timeout (ns, rm->now + 1); return; } rtnl_schedule_timeout (ns, rm->now + RTNL_DUMP_TIMEOUT); ns->sync_state = RTNL_SS_NEIGH; break; case RTNL_SS_NEIGH: ns->state = RTNL_S_READY; ns->sync_state = 0; rtnl_cancel_timeout (ns); break; } } static void rtnl_sync_timeout (rtnl_ns_t *ns) { rtnl_main_t *rm = &rtnl_main; struct ifinfomsg imsg = {}; switch (ns->sync_state) { case RTNL_SS_OPENING: if (rtnl_socket_open (ns)) { rtnl_schedule_timeout (ns, rm->now + 10); return; } imsg.ifi_family = AF_UNSPEC; if (rtnl_dump_request (ns, RTM_GETLINK, &imsg, sizeof (imsg))) { rtnl_sync_reset (ns); rtnl_schedule_timeout (ns, rm->now + 10); } ns->sync_state = RTNL_SS_LINK; rtnl_schedule_timeout (ns, rm->now + 2); break; case RTNL_SS_LINK: case RTNL_SS_ADDR: case RTNL_SS_ROUTE4: case RTNL_SS_ROUTE6: case RTNL_SS_NEIGH: // Timeout happened while synchronizing rtnl_sync_reset (ns); rtnl_schedule_timeout (ns, rm->now + 1); break; } } static int rtnl_ns_recv (rtnl_ns_t *ns, struct nlmsghdr *hdr) { rtnl_main_t *rm = &rtnl_main; int ret, error = 0; if (ns->state == RTNL_S_SYNC && ((hdr->nlmsg_flags & RTM_F_NOTIFY) || (hdr->nlmsg_seq != (ns->rtnl_seq)))) { clib_warning ( "Received notification while in sync. Restart synchronization."); rtnl_sync_reset (ns); rtnl_schedule_timeout (ns, rm->now); } switch (hdr->nlmsg_type) { case NLMSG_DONE: rtnl_sync_done (ns); break; case NLMSG_ERROR: if ((ret = rtnl_rcv_error (ns, hdr, &error))) return ret; break; case RTM_NEWROUTE: case RTM_DELROUTE: case RTM_NEWLINK: case RTM_DELLINK: case RTM_NEWADDR: case RTM_DELADDR: case RTM_NEWNEIGH: case RTM_DELNEIGH: if (ns->stream.recv_message) ns->stream.recv_message (hdr, ns->stream.opaque); break; default: clib_warning ("Unknown rtnetlink type %d", hdr->nlmsg_type); break; } return 0; } static void rtnl_process_open (rtnl_ns_t *ns) { rtnl_main_t *rm = &rtnl_main; if (ns->state != RTNL_S_INIT) return; ns->state = RTNL_S_SYNC; ns->sync_state = RTNL_SS_OPENING; rtnl_schedule_timeout (ns, rm->now); } static void rtnl_process_close (rtnl_ns_t *ns) { rtnl_main_t *rm = &rtnl_main; if (ns->state == RTNL_S_INIT) return; rtnl_socket_close (ns); close (ns->ns_fd); pool_put (rm->streams, ns); } static int rtnl_process_read (rtnl_ns_t *ns) { uint8_t buff[RTNL_BUFFSIZ]; ssize_t len; struct nlmsghdr *hdr; while (1) { if ((len = recv (ns->rtnl_socket, buff, RTNL_BUFFSIZ, MSG_DONTWAIT)) < 0) { if (errno != EAGAIN) { clib_warning ("rtnetlink recv error (%d) [%s]: %s", ns->rtnl_socket, ns->stream.name, strerror (errno)); return -1; } return 0; } for (hdr = (struct nlmsghdr *) buff; len > 0; len -= NLMSG_ALIGN (hdr->nlmsg_len), hdr = (struct nlmsghdr *) (((uint8_t *) hdr) + NLMSG_ALIGN (hdr->nlmsg_len))) { if ((sizeof (*hdr) > (size_t) len) || (hdr->nlmsg_len > (size_t) len)) { clib_warning ("rtnetlink buffer too small (%d Vs %d)", (int) hdr->nlmsg_len, (int) len); return -1; } if (rtnl_ns_recv (ns, hdr)) return -1; } } return 0; } static void rtnl_process_timeout (rtnl_ns_t *ns) { switch (ns->state) { case RTNL_S_SYNC: rtnl_sync_timeout (ns); break; case RTNL_S_INIT: case RTNL_S_READY: clib_warning ("Should not happen"); break; } } static uword rtnl_process (vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame) { rtnl_main_t *rm = &rtnl_main; uword event_type; uword *event_data = 0; rm->now = vlib_time_now (vm); f64 timeout = DBL_MAX; rtnl_ns_t *ns; // Setting up while (1) { vlib_process_wait_for_event_or_clock (vm, timeout - rm->now); event_type = vlib_process_get_events (vm, &event_data); rm->now = vlib_time_now (vm); if (event_type == ~0) { // Clock event or no event pool_foreach (ns, rm->streams, { if (ns->timeout < rm->now) { ns->timeout = DBL_MAX; rtnl_process_timeout (ns); } }) ; } else { rtnl_ns_t *ns; uword *d; vec_foreach (d, event_data) { ns = &rm->streams[d[0]]; switch (event_type) { case RTNL_E_CLOSE: rtnl_process_close (ns); break; case RTNL_E_OPEN: rtnl_process_open (ns); break; case RTNL_E_READ: rtnl_process_read (ns); break; } } } vec_reset_length (event_data); timeout = DBL_MAX; pool_foreach (ns, rm->streams, { if (ns->timeout < timeout) timeout = ns->timeout; }) ; } return frame->n_vectors; } VLIB_REGISTER_NODE (rtnl_process_node, static) = { .function = rtnl_process, .name = "rtnl-process", .type = VLIB_NODE_TYPE_PROCESS, }; u32 rtnl_stream_open (rtnl_stream_t *template) { vlib_main_t *vm = vlib_get_main (); rtnl_main_t *rm = &rtnl_main; rtnl_ns_t *ns; int fd; u8 *s = format ((u8 *) 0, "%U", format_rtnl_nsname2path, template->name); vec_add1 (s, 0); if ((fd = open ((char *) s, O_RDONLY)) < 0) { clib_unix_warning ("open stream %s: ", s); vec_free (s); return ~0; } vec_free (s); pool_get (rm->streams, ns); ns->state = RTNL_S_INIT; ns->ns_fd = fd; ns->stream = *template; vlib_process_signal_event (vm, rtnl_process_node.index, RTNL_E_OPEN, (uword) (ns - rm->streams)); return ns - rm->streams; } void rtnl_stream_close (u32 stream_index) { vlib_main_t *vm = vlib_get_main (); rtnl_main_t *rm = &rtnl_main; ASSERT (!pool_is_free_index (rm->streams, stream_index)); vlib_process_signal_event (vm, rtnl_process_node.index, RTNL_E_CLOSE, stream_index); } clib_error_t * rtnl_init (vlib_main_t *vm) { rtnl_main_t *rm = &rtnl_main; rm->streams = 0; return 0; } VLIB_INIT_FUNCTION (rtnl_init);