From 3b64d6334b4e8d0759cff043a55042f88d1ccb0e Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Fri, 8 Sep 2017 12:26:12 +0200 Subject: vlib: move linux-specific code to vlib/linux Change-Id: Id79d2c2be7a98e15416a537c890a8f2dd6d4464d Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/init.c | 1 + src/plugins/memif/memif.c | 1 + src/plugins/memif/private.h | 30 -- src/vlib.am | 7 +- src/vlib/linux/pci.c | 666 +++++++++++++++++++++++++++++++++ src/vlib/linux/physmem.c | 411 ++++++++++++++++++++ src/vlib/linux/syscall.h | 58 +++ src/vlib/linux/sysfs.c | 250 +++++++++++++ src/vlib/linux/sysfs.h | 44 +++ src/vlib/pci/linux_pci.c | 665 -------------------------------- src/vlib/threads_cli.c | 1 + src/vlib/unix/physmem.c | 439 ---------------------- src/vlib/unix/unix.h | 17 - src/vlib/unix/util.c | 219 ----------- src/vnet/devices/af_packet/af_packet.c | 1 + 15 files changed, 1438 insertions(+), 1372 deletions(-) create mode 100644 src/vlib/linux/pci.c create mode 100644 src/vlib/linux/physmem.c create mode 100644 src/vlib/linux/syscall.h create mode 100644 src/vlib/linux/sysfs.c create mode 100644 src/vlib/linux/sysfs.h delete mode 100644 src/vlib/pci/linux_pci.c delete mode 100644 src/vlib/unix/physmem.c diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index e23542f7554..4ef3b676244 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/src/plugins/memif/memif.c b/src/plugins/memif/memif.c index 7e2d947f426..4c387b92f81 100644 --- a/src/plugins/memif/memif.c +++ b/src/plugins/memif/memif.c @@ -33,6 +33,7 @@ #include #include +#include #include #include #include diff --git a/src/plugins/memif/private.h b/src/plugins/memif/private.h index 985ac5ec985..b5f2f8ff55f 100644 --- a/src/plugins/memif/private.h +++ b/src/plugins/memif/private.h @@ -228,24 +228,6 @@ int memif_create_if (vlib_main_t * vm, memif_create_if_args_t * args); int memif_delete_if (vlib_main_t * vm, memif_if_t * mif); clib_error_t *memif_plugin_api_hookup (vlib_main_t * vm); -#ifndef __NR_memfd_create -#if defined __x86_64__ -#define __NR_memfd_create 319 -#elif defined __arm__ -#define __NR_memfd_create 385 -#elif defined __aarch64__ -#define __NR_memfd_create 279 -#else -#error "__NR_memfd_create unknown for this architecture" -#endif -#endif - -static inline int -memfd_create (const char *name, unsigned int flags) -{ - return syscall (__NR_memfd_create, name, flags); -} - static_always_inline void * memif_get_buffer (memif_if_t * mif, memif_ring_t * ring, u16 slot) { @@ -253,18 +235,6 @@ memif_get_buffer (memif_if_t * mif, memif_ring_t * ring, u16 slot) return mif->regions[region].shm + ring->desc[slot].offset; } -#ifndef F_LINUX_SPECIFIC_BASE -#define F_LINUX_SPECIFIC_BASE 1024 -#endif -#define MFD_ALLOW_SEALING 0x0002U -#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) -#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) - -#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ -#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ -#define F_SEAL_GROW 0x0004 /* prevent file from growing */ -#define F_SEAL_WRITE 0x0008 /* prevent writes */ - /* memif.c */ clib_error_t *memif_init_regions_and_queues (memif_if_t * mif); clib_error_t *memif_connect (memif_if_t * mif); diff --git a/src/vlib.am b/src/vlib.am index cab90e2d034..41d686903cd 100644 --- a/src/vlib.am +++ b/src/vlib.am @@ -32,13 +32,15 @@ libvlib_la_SOURCES = \ vlib/format.c \ vlib/i2c.c \ vlib/init.c \ + vlib/linux/pci.c \ + vlib/linux/physmem.c \ + vlib/linux/sysfs.c \ vlib/main.c \ vlib/mc.c \ vlib/node.c \ vlib/node_cli.c \ vlib/node_format.c \ vlib/pci/pci.c \ - vlib/pci/linux_pci.c \ vlib/threads.c \ vlib/threads_cli.c \ vlib/trace.c @@ -58,6 +60,8 @@ nobase_include_HEADERS += \ vlib/global_funcs.h \ vlib/i2c.h \ vlib/init.h \ + vlib/linux/sysfs.h \ + vlib/linux/syscall.h \ vlib/main.h \ vlib/mc.h \ vlib/node_funcs.h \ @@ -79,7 +83,6 @@ libvlib_la_SOURCES += \ vlib/unix/mc_socket.c \ vlib/unix/plugin.c \ vlib/unix/plugin.h \ - vlib/unix/physmem.c \ vlib/unix/util.c nobase_include_HEADERS += \ diff --git a/src/vlib/linux/pci.c b/src/vlib/linux/pci.c new file mode 100644 index 00000000000..cd2affdc903 --- /dev/null +++ b/src/vlib/linux/pci.c @@ -0,0 +1,666 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * pci.c: Linux user space PCI bus management. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef struct +{ + /* /sys/bus/pci/devices/... directory name for this device. */ + u8 *dev_dir_name; + + /* Resource file descriptors. */ + int *resource_fds; + + /* File descriptor for config space read/write. */ + int config_fd; + + /* File descriptor for /dev/uio%d */ + int uio_fd; + + /* Minor device for uio device. */ + u32 uio_minor; + + /* Index given by unix_file_add. */ + u32 unix_file_index; + +} linux_pci_device_t; + +/* Pool of PCI devices. */ +typedef struct +{ + vlib_main_t *vlib_main; + linux_pci_device_t *linux_pci_devices; +} linux_pci_main_t; + +extern linux_pci_main_t linux_pci_main; + +/* Call to allocate/initialize the pci subsystem. + This is not an init function so that users can explicitly enable + pci only when it's needed. */ +clib_error_t *pci_bus_init (vlib_main_t * vm); + +clib_error_t *vlib_pci_bind_to_uio (vlib_pci_device_t * d, + char *uio_driver_name); + +linux_pci_main_t linux_pci_main; + +clib_error_t * +vlib_pci_bind_to_uio (vlib_pci_device_t * d, char *uio_driver_name) +{ + clib_error_t *error = 0; + u8 *s = 0, *driver_name = 0; + DIR *dir = 0; + struct dirent *e; + int fd, clear_driver_override = 0; + u8 *dev_dir_name = format (0, "/sys/bus/pci/devices/%U", + format_vlib_pci_addr, &d->bus_address); + + s = format (s, "%v/driver%c", dev_dir_name, 0); + driver_name = vlib_sysfs_link_to_name ((char *) s); + vec_reset_length (s); + + if (driver_name && + ((strcmp ("vfio-pci", (char *) driver_name) == 0) || + (strcmp ("uio_pci_generic", (char *) driver_name) == 0) || + (strcmp ("igb_uio", (char *) driver_name) == 0))) + goto done; + + /* walk trough all linux interfaces and if interface belonging to + this device is founf check if interface is admin up */ + dir = opendir ("/sys/class/net"); + s = format (s, "%U%c", format_vlib_pci_addr, &d->bus_address, 0); + + if (!dir) + { + error = clib_error_return (0, "Skipping PCI device %U: failed to " + "read /sys/class/net", + format_vlib_pci_addr, &d->bus_address); + goto done; + } + + fd = socket (PF_INET, SOCK_DGRAM, 0); + if (fd < 0) + { + error = clib_error_return_unix (0, "socket"); + goto done; + } + + while ((e = readdir (dir))) + { + struct ifreq ifr; + struct ethtool_drvinfo drvinfo; + + if (e->d_name[0] == '.') /* skip . and .. */ + continue; + + memset (&ifr, 0, sizeof ifr); + memset (&drvinfo, 0, sizeof drvinfo); + ifr.ifr_data = (char *) &drvinfo; + strncpy (ifr.ifr_name, e->d_name, IFNAMSIZ - 1); + drvinfo.cmd = ETHTOOL_GDRVINFO; + if (ioctl (fd, SIOCETHTOOL, &ifr) < 0) + { + /* Some interfaces (eg "lo") don't support this ioctl */ + if ((errno != ENOTSUP) && (errno != ENODEV)) + clib_unix_warning ("ioctl fetch intf %s bus info error", + e->d_name); + continue; + } + + if (strcmp ((char *) s, drvinfo.bus_info)) + continue; + + memset (&ifr, 0, sizeof (ifr)); + strncpy (ifr.ifr_name, e->d_name, IFNAMSIZ - 1); + if (ioctl (fd, SIOCGIFFLAGS, &ifr) < 0) + { + error = clib_error_return_unix (0, "ioctl fetch intf %s flags", + e->d_name); + close (fd); + goto done; + } + + if (ifr.ifr_flags & IFF_UP) + { + error = clib_error_return (0, "Skipping PCI device %U as host " + "interface %s is up", + format_vlib_pci_addr, &d->bus_address, + e->d_name); + close (fd); + goto done; + } + } + + close (fd); + vec_reset_length (s); + + s = format (s, "%v/driver/unbind%c", dev_dir_name, 0); + vlib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address); + vec_reset_length (s); + + s = format (s, "%v/driver_override%c", dev_dir_name, 0); + if (access ((char *) s, F_OK) == 0) + { + vlib_sysfs_write ((char *) s, "%s", uio_driver_name); + clear_driver_override = 1; + } + else + { + vec_reset_length (s); + s = format (s, "/sys/bus/pci/drivers/%s/new_id%c", uio_driver_name, 0); + vlib_sysfs_write ((char *) s, "0x%04x 0x%04x", d->vendor_id, + d->device_id); + } + vec_reset_length (s); + + s = format (s, "/sys/bus/pci/drivers/%s/bind%c", uio_driver_name, 0); + vlib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address); + vec_reset_length (s); + + if (clear_driver_override) + { + s = format (s, "%v/driver_override%c", dev_dir_name, 0); + vlib_sysfs_write ((char *) s, "%c", 0); + vec_reset_length (s); + } + +done: + closedir (dir); + vec_free (s); + vec_free (dev_dir_name); + vec_free (driver_name); + return error; +} + + +static clib_error_t * +scan_uio_dir (void *arg, u8 * path_name, u8 * file_name) +{ + linux_pci_device_t *l = arg; + unformat_input_t input; + + unformat_init_string (&input, (char *) file_name, vec_len (file_name)); + + if (!unformat (&input, "uio%d", &l->uio_minor)) + abort (); + + unformat_free (&input); + return 0; +} + +static clib_error_t * +linux_pci_uio_read_ready (unix_file_t * uf) +{ + vlib_pci_main_t *pm = &pci_main; + vlib_pci_device_t *d; + int __attribute__ ((unused)) rv; + + u32 icount; + rv = read (uf->file_descriptor, &icount, 4); + + d = pool_elt_at_index (pm->pci_devs, uf->private_data); + + if (d->interrupt_handler) + d->interrupt_handler (d); + + vlib_pci_intr_enable (d); + + return /* no error */ 0; +} + +static clib_error_t * +linux_pci_uio_error_ready (unix_file_t * uf) +{ + u32 error_index = (u32) uf->private_data; + + return clib_error_return (0, "pci device %d: error", error_index); +} + +static void +add_device (vlib_pci_device_t * dev, linux_pci_device_t * pdev) +{ + vlib_pci_main_t *pm = &pci_main; + linux_pci_main_t *lpm = &linux_pci_main; + linux_pci_device_t *l; + + pool_get (lpm->linux_pci_devices, l); + l[0] = pdev[0]; + + l->dev_dir_name = vec_dup (l->dev_dir_name); + + dev->os_handle = l - lpm->linux_pci_devices; + + { + u8 *uio_dir = format (0, "%s/uio", l->dev_dir_name); + foreach_directory_file ((char *) uio_dir, scan_uio_dir, l, /* scan_dirs */ + 1); + vec_free (uio_dir); + } + + { + char *uio_name = (char *) format (0, "/dev/uio%d%c", l->uio_minor, 0); + l->uio_fd = open (uio_name, O_RDWR); + if (l->uio_fd < 0) + clib_unix_error ("open `%s'", uio_name); + vec_free (uio_name); + } + + { + unix_file_t template = { 0 }; + unix_main_t *um = &unix_main; + + template.read_function = linux_pci_uio_read_ready; + template.file_descriptor = l->uio_fd; + template.error_function = linux_pci_uio_error_ready; + template.private_data = dev - pm->pci_devs; + + l->unix_file_index = unix_file_add (um, &template); + } +} + +static void +linux_pci_device_free (linux_pci_device_t * l) +{ + int i; + for (i = 0; i < vec_len (l->resource_fds); i++) + if (l->resource_fds[i] > 0) + close (l->resource_fds[i]); + if (l->config_fd > 0) + close (l->config_fd); + if (l->uio_fd > 0) + close (l->uio_fd); + vec_free (l->resource_fds); + vec_free (l->dev_dir_name); +} + +/* Configuration space read/write. */ +clib_error_t * +vlib_pci_read_write_config (vlib_pci_device_t * dev, + vlib_read_or_write_t read_or_write, + uword address, void *data, u32 n_bytes) +{ + linux_pci_main_t *lpm = &linux_pci_main; + linux_pci_device_t *p; + int n; + + p = pool_elt_at_index (lpm->linux_pci_devices, dev->os_handle); + + if (read_or_write == VLIB_READ) + n = pread (p->config_fd, data, n_bytes, address); + else + n = pwrite (p->config_fd, data, n_bytes, address); + + if (n != n_bytes) + return clib_error_return_unix (0, "%s", + read_or_write == VLIB_READ + ? "read" : "write"); + + return 0; +} + +static clib_error_t * +os_map_pci_resource_internal (uword os_handle, + u32 resource, u8 * addr, void **result) +{ + linux_pci_main_t *pm = &linux_pci_main; + linux_pci_device_t *p; + struct stat stat_buf; + u8 *file_name; + int fd; + clib_error_t *error; + int flags = MAP_SHARED; + + error = 0; + p = pool_elt_at_index (pm->linux_pci_devices, os_handle); + + file_name = format (0, "%v/resource%d%c", p->dev_dir_name, resource, 0); + fd = open ((char *) file_name, O_RDWR); + if (fd < 0) + { + error = clib_error_return_unix (0, "open `%s'", file_name); + goto done; + } + + if (fstat (fd, &stat_buf) < 0) + { + error = clib_error_return_unix (0, "fstat `%s'", file_name); + goto done; + } + + vec_validate (p->resource_fds, resource); + p->resource_fds[resource] = fd; + if (addr != 0) + flags |= MAP_FIXED; + + *result = mmap (addr, + /* size */ stat_buf.st_size, + PROT_READ | PROT_WRITE, flags, + /* file */ fd, + /* offset */ 0); + if (*result == (void *) -1) + { + error = clib_error_return_unix (0, "mmap `%s'", file_name); + goto done; + } + +done: + if (error) + { + if (fd >= 0) + close (fd); + } + vec_free (file_name); + return error; +} + +clib_error_t * +vlib_pci_map_resource (vlib_pci_device_t * dev, u32 resource, void **result) +{ + return (os_map_pci_resource_internal + (dev->os_handle, resource, 0 /* addr */ , + result)); +} + +clib_error_t * +vlib_pci_map_resource_fixed (vlib_pci_device_t * dev, + u32 resource, u8 * addr, void **result) +{ + return (os_map_pci_resource_internal + (dev->os_handle, resource, addr, result)); +} + +void +vlib_pci_free_device (vlib_pci_device_t * dev) +{ + linux_pci_main_t *pm = &linux_pci_main; + linux_pci_device_t *l; + + l = pool_elt_at_index (pm->linux_pci_devices, dev->os_handle); + linux_pci_device_free (l); + pool_put (pm->linux_pci_devices, l); +} + +pci_device_registration_t * __attribute__ ((unused)) +pci_device_next_registered (pci_device_registration_t * r) +{ + uword i; + + /* Null vendor id marks end of initialized list. */ + for (i = 0; r->supported_devices[i].vendor_id != 0; i++) + ; + + return clib_elf_section_data_next (r, i * sizeof (r->supported_devices[0])); +} + +static clib_error_t * +init_device_from_registered (vlib_main_t * vm, + vlib_pci_device_t * dev, + linux_pci_device_t * pdev) +{ + vlib_pci_main_t *pm = &pci_main; + pci_device_registration_t *r; + pci_device_id_t *i; + clib_error_t *error; + + r = pm->pci_device_registrations; + + while (r) + { + for (i = r->supported_devices; i->vendor_id != 0; i++) + if (i->vendor_id == dev->vendor_id && i->device_id == dev->device_id) + { + error = vlib_pci_bind_to_uio (dev, "uio_pci_generic"); + if (error) + { + clib_error_report (error); + continue; + } + + add_device (dev, pdev); + dev->interrupt_handler = r->interrupt_handler; + return r->init_function (vm, dev); + } + r = r->next_registration; + } + /* No driver, close the PCI config-space FD */ + close (pdev->config_fd); + return 0; +} + +static clib_error_t * +init_device (vlib_main_t * vm, + vlib_pci_device_t * dev, linux_pci_device_t * pdev) +{ + return init_device_from_registered (vm, dev, pdev); +} + +static clib_error_t * +scan_device (void *arg, u8 * dev_dir_name, u8 * ignored) +{ + vlib_main_t *vm = arg; + vlib_pci_main_t *pm = &pci_main; + int fd; + u8 *f; + clib_error_t *error = 0; + vlib_pci_device_t *dev; + linux_pci_device_t pdev = { 0 }; + u32 tmp; + + f = format (0, "%v/config%c", dev_dir_name, 0); + fd = open ((char *) f, O_RDWR); + + /* Try read-only access if write fails. */ + if (fd < 0) + fd = open ((char *) f, O_RDONLY); + + if (fd < 0) + { + error = clib_error_return_unix (0, "open `%s'", f); + goto done; + } + + pool_get (pm->pci_devs, dev); + + /* You can only read more that 64 bytes of config space as root; so we try to + read the full space but fall back to just the first 64 bytes. */ + if (read (fd, &dev->config_data, sizeof (dev->config_data)) != + sizeof (dev->config_data) + && read (fd, &dev->config0, + sizeof (dev->config0)) != sizeof (dev->config0)) + { + pool_put (pm->pci_devs, dev); + error = clib_error_return_unix (0, "read `%s'", f); + close (fd); + goto done; + } + + { + static pci_config_header_t all_ones; + if (all_ones.vendor_id == 0) + memset (&all_ones, ~0, sizeof (all_ones)); + + if (!memcmp (&dev->config0.header, &all_ones, sizeof (all_ones))) + { + pool_put (pm->pci_devs, dev); + error = clib_error_return (0, "invalid PCI config for `%s'", f); + close (fd); + goto done; + } + } + + if (dev->config0.header.header_type == 0) + pci_config_type0_little_to_host (&dev->config0); + else + pci_config_type1_little_to_host (&dev->config1); + + /* Parse bus, dev, function from directory name. */ + { + unformat_input_t input; + + unformat_init_string (&input, (char *) dev_dir_name, + vec_len (dev_dir_name)); + + if (!unformat (&input, "/sys/bus/pci/devices/%U", + unformat_vlib_pci_addr, &dev->bus_address)) + abort (); + + unformat_free (&input); + + } + + + pdev.config_fd = fd; + pdev.dev_dir_name = dev_dir_name; + + hash_set (pm->pci_dev_index_by_pci_addr, dev->bus_address.as_u32, + dev - pm->pci_devs); + + vec_reset_length (f); + f = format (f, "%v/vpd%c", dev_dir_name, 0); + fd = open ((char *) f, O_RDONLY); + if (fd >= 0) + { + while (1) + { + u8 tag[3]; + u8 *data = 0; + int len; + + if (read (fd, &tag, 3) != 3) + break; + + if (tag[0] != 0x82 && tag[0] != 0x90 && tag[0] != 0x91) + break; + + len = (tag[2] << 8) | tag[1]; + vec_validate (data, len); + + if (read (fd, data, len) != len) + { + vec_free (data); + break; + } + if (tag[0] == 0x82) + dev->product_name = data; + else if (tag[0] == 0x90) + dev->vpd_r = data; + else if (tag[0] == 0x91) + dev->vpd_w = data; + + data = 0; + } + close (fd); + } + + dev->numa_node = -1; + vec_reset_length (f); + f = format (f, "%v/numa_node%c", dev_dir_name, 0); + vlib_sysfs_read ((char *) f, "%u", &dev->numa_node); + + vec_reset_length (f); + f = format (f, "%v/class%c", dev_dir_name, 0); + vlib_sysfs_read ((char *) f, "0x%x", &tmp); + dev->device_class = tmp >> 8; + + vec_reset_length (f); + f = format (f, "%v/vendor%c", dev_dir_name, 0); + vlib_sysfs_read ((char *) f, "0x%x", &tmp); + dev->vendor_id = tmp; + + vec_reset_length (f); + f = format (f, "%v/device%c", dev_dir_name, 0); + vlib_sysfs_read ((char *) f, "0x%x", &tmp); + dev->device_id = tmp; + + error = init_device (vm, dev, &pdev); + + vec_reset_length (f); + f = format (f, "%v/driver%c", dev_dir_name, 0); + dev->driver_name = vlib_sysfs_link_to_name ((char *) f); + +done: + vec_free (f); + return error; +} + +clib_error_t * +linux_pci_init (vlib_main_t * vm) +{ + vlib_pci_main_t *pm = &pci_main; + clib_error_t *error; + + pm->vlib_main = vm; + + if ((error = vlib_call_init_function (vm, unix_input_init))) + return error; + + ASSERT (sizeof (vlib_pci_addr_t) == sizeof (u32)); + pm->pci_dev_index_by_pci_addr = hash_create (0, sizeof (uword)); + + error = foreach_directory_file ("/sys/bus/pci/devices", scan_device, vm, + /* scan_dirs */ 0); + + /* Complain and continue. might not be root, etc. */ + if (error) + clib_error_report (error); + + return error; +} + +VLIB_INIT_FUNCTION (linux_pci_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/linux/physmem.c b/src/vlib/linux/physmem.c new file mode 100644 index 00000000000..6731295caec --- /dev/null +++ b/src/vlib/linux/physmem.c @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * physmem.c: Unix physical memory + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static void * +unix_physmem_alloc_aligned (vlib_main_t * vm, vlib_physmem_region_index_t idx, + uword n_bytes, uword alignment) +{ + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); + uword lo_offset, hi_offset; + uword *to_free = 0; + + if (pr->heap == 0) + return 0; + + /* IO memory is always at least cache aligned. */ + alignment = clib_max (alignment, CLIB_CACHE_LINE_BYTES); + + while (1) + { + mheap_get_aligned (pr->heap, n_bytes, + /* align */ alignment, + /* align offset */ 0, + &lo_offset); + + /* Allocation failed? */ + if (lo_offset == ~0) + break; + + if (pr->flags & VLIB_PHYSMEM_F_FAKE) + break; + + /* Make sure allocation does not span DMA physical chunk boundary. */ + hi_offset = lo_offset + n_bytes - 1; + + if ((lo_offset >> pr->log2_page_size) == + (hi_offset >> pr->log2_page_size)) + break; + + /* Allocation would span chunk boundary, queue it to be freed as soon as + we find suitable chunk. */ + vec_add1 (to_free, lo_offset); + } + + if (to_free != 0) + { + uword i; + for (i = 0; i < vec_len (to_free); i++) + mheap_put (pr->heap, to_free[i]); + vec_free (to_free); + } + + return lo_offset != ~0 ? pr->heap + lo_offset : 0; +} + +static void +unix_physmem_free (vlib_main_t * vm, vlib_physmem_region_index_t idx, void *x) +{ + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); + /* Return object to region's heap. */ + mheap_put (pr->heap, x - pr->heap); +} + +static u64 +get_page_paddr (int fd, uword addr) +{ + int pagesize = sysconf (_SC_PAGESIZE); + u64 seek, pagemap = 0; + + seek = ((u64) addr / pagesize) * sizeof (u64); + if (lseek (fd, seek, SEEK_SET) != seek) + { + clib_unix_warning ("lseek to 0x%llx", seek); + return 0; + } + if (read (fd, &pagemap, sizeof (pagemap)) != (sizeof (pagemap))) + { + clib_unix_warning ("read ptbits"); + return 0; + } + if ((pagemap & (1ULL << 63)) == 0) + return 0; + + pagemap &= pow2_mask (55); + + return pagemap * pagesize; +} + +static clib_error_t * +unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, + u8 numa_node, u32 flags, + vlib_physmem_region_index_t * idx) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + vlib_physmem_region_t *pr; + clib_error_t *error = 0; + int pagemap_fd = -1; + u8 *mount_dir = 0; + u8 *filename = 0; + struct stat st; + int old_mpol; + int mmap_flags; + struct bitmask *old_mask = numa_allocate_nodemask (); + + if (geteuid () != 0 && (flags & VLIB_PHYSMEM_F_FAKE) == 0) + return clib_error_return (0, "not allowed"); + + pool_get (vpm->regions, pr); + + if ((pr - vpm->regions) >= 256) + { + error = clib_error_return (0, "maximum number of regions reached"); + goto error; + } + + pr->index = pr - vpm->regions; + pr->fd = -1; + pr->flags = flags; + + if (get_mempolicy (&old_mpol, old_mask->maskp, old_mask->size + 1, NULL, 0) + == -1) + { + error = clib_error_return_unix (0, "get_mempolicy"); + goto error; + } + + if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) + { + if ((pagemap_fd = open ((char *) "/proc/self/pagemap", O_RDONLY)) == -1) + { + error = clib_error_return_unix (0, "open '/proc/self/pagemap'"); + goto error; + } + + mount_dir = format (0, "%s/physmem_region%d%c", + vlib_unix_get_runtime_dir (), pr->index, 0); + filename = format (0, "%s/mem%c", mount_dir, 0); + + unlink ((char *) mount_dir); + + error = vlib_unix_recursive_mkdir ((char *) mount_dir); + if (error) + goto error; + + if (mount ("none", (char *) mount_dir, "hugetlbfs", 0, NULL)) + { + error = clib_error_return_unix (0, "mount hugetlb directory '%s'", + mount_dir); + goto error; + } + + if ((pr->fd = open ((char *) filename, O_CREAT | O_RDWR, 0755)) == -1) + { + error = clib_error_return_unix (0, "open"); + goto error; + } + + mmap_flags = MAP_SHARED | MAP_HUGETLB | MAP_LOCKED; + } + else + { + if ((pr->fd = memfd_create (name, MFD_ALLOW_SEALING)) == -1) + return clib_error_return_unix (0, "memfd_create"); + + if ((fcntl (pr->fd, F_ADD_SEALS, F_SEAL_SHRINK)) == -1) + { + error = + clib_error_return_unix (0, "fcntl (F_ADD_SEALS, F_SEAL_SHRINK)"); + goto error; + } + mmap_flags = MAP_SHARED; + } + + if (fstat (pr->fd, &st)) + { + error = clib_error_return_unix (0, "fstat"); + goto error; + } + + pr->log2_page_size = min_log2 (st.st_blksize); + pr->n_pages = ((size - 1) >> pr->log2_page_size) + 1; + size = pr->n_pages * (1 << pr->log2_page_size); + + if ((ftruncate (pr->fd, size)) == -1) + { + error = clib_error_return_unix (0, "ftruncate length: %d", size); + goto error; + } + + if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) + { + error = vlib_sysfs_prealloc_hugepages (numa_node, + 1 << (pr->log2_page_size - 10), + pr->n_pages); + if (error) + goto error; + } + + numa_set_preferred (numa_node); + + pr->mem = mmap (0, size, (PROT_READ | PROT_WRITE), mmap_flags, pr->fd, 0); + + if (pr->mem == MAP_FAILED) + { + pr->mem = 0; + error = clib_error_return_unix (0, "mmap"); + goto error; + } + + if (set_mempolicy (old_mpol, old_mask->maskp, old_mask->size + 1) == -1) + { + error = clib_error_return_unix (0, "set_mempolicy"); + goto error; + } + + pr->size = pr->n_pages << pr->log2_page_size; + pr->page_mask = (1 << pr->log2_page_size) - 1; + pr->numa_node = numa_node; + pr->name = format (0, "%s", name); + + if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) + { + int i; + for (i = 0; i < pr->n_pages; i++) + { + void *ptr = pr->mem + (i << pr->log2_page_size); + int node; + move_pages (0, 1, &ptr, 0, &node, 0); + if (numa_node != node) + { + clib_warning + ("physmem page for region \'%s\' allocated on the wrong" + " numa node (requested %u actual %u)", pr->name, + pr->numa_node, node, i); + break; + } + } + } + + if (flags & VLIB_PHYSMEM_F_INIT_MHEAP) + { + pr->heap = mheap_alloc_with_flags (pr->mem, pr->size, + /* Don't want mheap mmap/munmap with IO memory. */ + MHEAP_FLAG_DISABLE_VM | + MHEAP_FLAG_THREAD_SAFE); + fformat (stdout, "%U", format_mheap, pr->heap, /* verbose */ 1); + } + + if (flags & VLIB_PHYSMEM_F_HAVE_BUFFERS) + { + vlib_buffer_add_mem_range (vm, pointer_to_uword (pr->mem), pr->size); + } + + *idx = pr->index; + + if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) + { + int i; + for (i = 0; i < pr->n_pages; i++) + { + uword vaddr = + pointer_to_uword (pr->mem) + (((u64) i) << pr->log2_page_size); + u64 page_paddr = get_page_paddr (pagemap_fd, vaddr); + vec_add1 (pr->page_table, page_paddr); + } + } + + goto done; + +error: + if (pr->fd > -1) + close (pr->fd); + + if (pr->mem) + munmap (pr->mem, size); + + memset (pr, 0, sizeof (*pr)); + pool_put (vpm->regions, pr); + +done: + if (mount_dir) + { + umount2 ((char *) mount_dir, MNT_DETACH); + rmdir ((char *) mount_dir); + vec_free (mount_dir); + } + numa_free_cpumask (old_mask); + vec_free (filename); + if (pagemap_fd > -1) + close (pagemap_fd); + return error; +} + +static void +unix_physmem_region_free (vlib_main_t * vm, vlib_physmem_region_index_t idx) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); + + if (pr->fd > 0) + close (pr->fd); + munmap (pr->mem, pr->size); + vec_free (pr->name); + pool_put (vpm->regions, pr); +} + +clib_error_t * +unix_physmem_init (vlib_main_t * vm) +{ + clib_error_t *error = 0; + + /* Avoid multiple calls. */ + if (vm->os_physmem_alloc_aligned) + return error; + + vm->os_physmem_alloc_aligned = unix_physmem_alloc_aligned; + vm->os_physmem_free = unix_physmem_free; + vm->os_physmem_region_alloc = unix_physmem_region_alloc; + vm->os_physmem_region_free = unix_physmem_region_free; + + return error; +} + +static clib_error_t * +show_physmem (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + vlib_physmem_region_t *pr; + + /* *INDENT-OFF* */ + pool_foreach (pr, vpm->regions, ( + { + vlib_cli_output (vm, "index %u name '%s' page-size %uKB num-pages %d " + "numa-node %u fd %d\n", + pr->index, pr->name, (1 << (pr->log2_page_size -10)), + pr->n_pages, pr->numa_node, pr->fd); + if (pr->heap) + vlib_cli_output (vm, " %U", format_mheap, pr->heap, /* verbose */ 1); + else + vlib_cli_output (vm, " no heap\n"); + })); + /* *INDENT-ON* */ + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_physmem_command, static) = { + .path = "show physmem", + .short_help = "Show physical memory allocation", + .function = show_physmem, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/linux/syscall.h b/src/vlib/linux/syscall.h new file mode 100644 index 00000000000..9e37997e8a9 --- /dev/null +++ b/src/vlib/linux/syscall.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef included_linux_syscall_h +#define included_linux_syscall_h + +#ifndef __NR_memfd_create +#if defined __x86_64__ +#define __NR_memfd_create 319 +#elif defined __arm__ +#define __NR_memfd_create 385 +#elif defined __aarch64__ +#define __NR_memfd_create 279 +#else +#error "__NR_memfd_create unknown for this architecture" +#endif +#endif + +static inline int +memfd_create (const char *name, unsigned int flags) +{ + return syscall (__NR_memfd_create, name, flags); +} + +#ifndef F_LINUX_SPECIFIC_BASE +#define F_LINUX_SPECIFIC_BASE 1024 +#endif +#define MFD_ALLOW_SEALING 0x0002U +#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) + +#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +#define F_SEAL_GROW 0x0004 /* prevent file from growing */ +#define F_SEAL_WRITE 0x0008 /* prevent writes */ + + +#endif /* included_linux_syscall_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/linux/sysfs.c b/src/vlib/linux/sysfs.c new file mode 100644 index 00000000000..f92f9ef564e --- /dev/null +++ b/src/vlib/linux/sysfs.c @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include + +clib_error_t * +vlib_sysfs_write (char *file_name, char *fmt, ...) +{ + u8 *s; + int fd; + clib_error_t *error = 0; + + fd = open (file_name, O_WRONLY); + if (fd < 0) + return clib_error_return_unix (0, "open `%s'", file_name); + + va_list va; + va_start (va, fmt); + s = va_format (0, fmt, &va); + va_end (va); + + if (write (fd, s, vec_len (s)) < 0) + error = clib_error_return_unix (0, "write `%s'", file_name); + + vec_free (s); + close (fd); + return error; +} + +clib_error_t * +vlib_sysfs_read (char *file_name, char *fmt, ...) +{ + unformat_input_t input; + u8 *s = 0; + int fd; + ssize_t sz; + uword result; + + fd = open (file_name, O_RDONLY); + if (fd < 0) + return clib_error_return_unix (0, "open `%s'", file_name); + + vec_validate (s, 4095); + + sz = read (fd, s, vec_len (s)); + if (sz < 0) + { + close (fd); + vec_free (s); + return clib_error_return_unix (0, "read `%s'", file_name); + } + + _vec_len (s) = sz; + unformat_init_vector (&input, s); + + va_list va; + va_start (va, fmt); + result = va_unformat (&input, fmt, &va); + va_end (va); + + vec_free (s); + close (fd); + + if (result == 0) + return clib_error_return (0, "unformat error"); + + return 0; +} + +u8 * +vlib_sysfs_link_to_name (char *link) +{ + char *p, buffer[64]; + unformat_input_t in; + u8 *s = 0; + int r; + + r = readlink (link, buffer, sizeof (buffer) - 1); + + if (r < 0) + return 0; + + buffer[r] = 0; + p = strrchr (buffer, '/'); + + if (!p) + return 0; + + unformat_init_string (&in, p + 1, strlen (p + 1)); + if (unformat (&in, "%s", &s) != 1) + clib_unix_warning ("no string?"); + unformat_free (&in); + + return s; +} + +clib_error_t * +vlib_sysfs_set_nr_hugepages (unsigned int numa_node, int page_size, int nr) +{ + clib_error_t *error = 0; + struct stat sb; + u8 *p = 0; + + p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0); + + if (stat ((char *) p, &sb) == 0) + { + if (S_ISDIR (sb.st_mode) == 0) + { + error = clib_error_return (0, "'%s' is not directory", p); + goto done; + } + } + else if (numa_node == 0) + { + vec_reset_length (p); + p = format (p, "/sys/kernel/mm%c", 0); + if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0) + { + error = clib_error_return (0, "'%s' does not exist or it is not " + "directory", p); + goto done; + } + } + else + { + error = clib_error_return (0, "'%s' does not exist", p); + goto done; + } + + _vec_len (p) -= 1; + p = format (p, "/hugepages/hugepages-%ukB/nr_hugepages%c", page_size, 0); + vlib_sysfs_write ((char *) p, "%d", nr); + +done: + vec_free (p); + return error; +} + + +static clib_error_t * +vlib_sysfs_get_xxx_hugepages (char *type, unsigned int numa_node, + int page_size, int *val) +{ + clib_error_t *error = 0; + struct stat sb; + u8 *p = 0; + + p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0); + + if (stat ((char *) p, &sb) == 0) + { + if (S_ISDIR (sb.st_mode) == 0) + { + error = clib_error_return (0, "'%s' is not directory", p); + goto done; + } + } + else if (numa_node == 0) + { + vec_reset_length (p); + p = format (p, "/sys/kernel/mm%c", 0); + if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0) + { + error = clib_error_return (0, "'%s' does not exist or it is not " + "directory", p); + goto done; + } + } + else + { + error = clib_error_return (0, "'%s' does not exist", p); + goto done; + } + + _vec_len (p) -= 1; + p = format (p, "/hugepages/hugepages-%ukB/%s_hugepages%c", page_size, + type, 0); + error = vlib_sysfs_read ((char *) p, "%d", val); + +done: + vec_free (p); + return error; +} + +clib_error_t * +vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size, int *v) +{ + return vlib_sysfs_get_xxx_hugepages ("free", numa_node, page_size, v); +} + +clib_error_t * +vlib_sysfs_get_nr_hugepages (unsigned int numa_node, int page_size, int *v) +{ + return vlib_sysfs_get_xxx_hugepages ("nr", numa_node, page_size, v); +} + +clib_error_t * +vlib_sysfs_get_surplus_hugepages (unsigned int numa_node, int page_size, + int *v) +{ + return vlib_sysfs_get_xxx_hugepages ("surplus", numa_node, page_size, v); +} + +clib_error_t * +vlib_sysfs_prealloc_hugepages (unsigned int numa_node, int page_size, int nr) +{ + clib_error_t *error = 0; + int n, needed; + error = vlib_sysfs_get_free_hugepages (numa_node, page_size, &n); + if (error) + return error; + needed = nr - n; + if (needed <= 0) + return 0; + + error = vlib_sysfs_get_nr_hugepages (numa_node, page_size, &n); + if (error) + return error; + clib_warning ("pre-allocating %u additional %uK hugepages on numa node %u", + needed, page_size, numa_node); + return vlib_sysfs_set_nr_hugepages (numa_node, page_size, n + needed); +} + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/linux/sysfs.h b/src/vlib/linux/sysfs.h new file mode 100644 index 00000000000..14b71317ffe --- /dev/null +++ b/src/vlib/linux/sysfs.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef included_linux_sysfs_h +#define included_linux_sysfs_h + +clib_error_t *vlib_sysfs_write (char *file_name, char *fmt, ...); + +clib_error_t *vlib_sysfs_read (char *file_name, char *fmt, ...); + +u8 *vlib_sysfs_link_to_name (char *link); + +clib_error_t *vlib_sysfs_set_nr_hugepages (unsigned int numa_node, + int page_size, int nr); +clib_error_t *vlib_sysfs_get_nr_hugepages (unsigned int numa_node, + int page_size, int *v); +clib_error_t *vlib_sysfs_get_free_hugepages (unsigned int numa_node, + int page_size, int *v); +clib_error_t *vlib_sysfs_get_surplus_hugepages (unsigned int numa_node, + int page_size, int *v); +clib_error_t *vlib_sysfs_prealloc_hugepages (unsigned int numa_node, + int page_size, int nr); + +#endif /* included_linux_sysfs_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/pci/linux_pci.c b/src/vlib/pci/linux_pci.c deleted file mode 100644 index 2d3c0a8807d..00000000000 --- a/src/vlib/pci/linux_pci.c +++ /dev/null @@ -1,665 +0,0 @@ -/* - * Copyright (c) 2016 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - * pci.c: Linux user space PCI bus management. - * - * Copyright (c) 2008 Eliot Dresselhaus - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -typedef struct -{ - /* /sys/bus/pci/devices/... directory name for this device. */ - u8 *dev_dir_name; - - /* Resource file descriptors. */ - int *resource_fds; - - /* File descriptor for config space read/write. */ - int config_fd; - - /* File descriptor for /dev/uio%d */ - int uio_fd; - - /* Minor device for uio device. */ - u32 uio_minor; - - /* Index given by unix_file_add. */ - u32 unix_file_index; - -} linux_pci_device_t; - -/* Pool of PCI devices. */ -typedef struct -{ - vlib_main_t *vlib_main; - linux_pci_device_t *linux_pci_devices; -} linux_pci_main_t; - -extern linux_pci_main_t linux_pci_main; - -/* Call to allocate/initialize the pci subsystem. - This is not an init function so that users can explicitly enable - pci only when it's needed. */ -clib_error_t *pci_bus_init (vlib_main_t * vm); - -clib_error_t *vlib_pci_bind_to_uio (vlib_pci_device_t * d, - char *uio_driver_name); - -linux_pci_main_t linux_pci_main; - -clib_error_t * -vlib_pci_bind_to_uio (vlib_pci_device_t * d, char *uio_driver_name) -{ - clib_error_t *error = 0; - u8 *s = 0, *driver_name = 0; - DIR *dir = 0; - struct dirent *e; - int fd, clear_driver_override = 0; - u8 *dev_dir_name = format (0, "/sys/bus/pci/devices/%U", - format_vlib_pci_addr, &d->bus_address); - - s = format (s, "%v/driver%c", dev_dir_name, 0); - driver_name = vlib_sysfs_link_to_name ((char *) s); - vec_reset_length (s); - - if (driver_name && - ((strcmp ("vfio-pci", (char *) driver_name) == 0) || - (strcmp ("uio_pci_generic", (char *) driver_name) == 0) || - (strcmp ("igb_uio", (char *) driver_name) == 0))) - goto done; - - /* walk trough all linux interfaces and if interface belonging to - this device is founf check if interface is admin up */ - dir = opendir ("/sys/class/net"); - s = format (s, "%U%c", format_vlib_pci_addr, &d->bus_address, 0); - - if (!dir) - { - error = clib_error_return (0, "Skipping PCI device %U: failed to " - "read /sys/class/net", - format_vlib_pci_addr, &d->bus_address); - goto done; - } - - fd = socket (PF_INET, SOCK_DGRAM, 0); - if (fd < 0) - { - error = clib_error_return_unix (0, "socket"); - goto done; - } - - while ((e = readdir (dir))) - { - struct ifreq ifr; - struct ethtool_drvinfo drvinfo; - - if (e->d_name[0] == '.') /* skip . and .. */ - continue; - - memset (&ifr, 0, sizeof ifr); - memset (&drvinfo, 0, sizeof drvinfo); - ifr.ifr_data = (char *) &drvinfo; - strncpy (ifr.ifr_name, e->d_name, IFNAMSIZ - 1); - drvinfo.cmd = ETHTOOL_GDRVINFO; - if (ioctl (fd, SIOCETHTOOL, &ifr) < 0) - { - /* Some interfaces (eg "lo") don't support this ioctl */ - if ((errno != ENOTSUP) && (errno != ENODEV)) - clib_unix_warning ("ioctl fetch intf %s bus info error", - e->d_name); - continue; - } - - if (strcmp ((char *) s, drvinfo.bus_info)) - continue; - - memset (&ifr, 0, sizeof (ifr)); - strncpy (ifr.ifr_name, e->d_name, IFNAMSIZ - 1); - if (ioctl (fd, SIOCGIFFLAGS, &ifr) < 0) - { - error = clib_error_return_unix (0, "ioctl fetch intf %s flags", - e->d_name); - close (fd); - goto done; - } - - if (ifr.ifr_flags & IFF_UP) - { - error = clib_error_return (0, "Skipping PCI device %U as host " - "interface %s is up", - format_vlib_pci_addr, &d->bus_address, - e->d_name); - close (fd); - goto done; - } - } - - close (fd); - vec_reset_length (s); - - s = format (s, "%v/driver/unbind%c", dev_dir_name, 0); - vlib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address); - vec_reset_length (s); - - s = format (s, "%v/driver_override%c", dev_dir_name, 0); - if (access ((char *) s, F_OK) == 0) - { - vlib_sysfs_write ((char *) s, "%s", uio_driver_name); - clear_driver_override = 1; - } - else - { - vec_reset_length (s); - s = format (s, "/sys/bus/pci/drivers/%s/new_id%c", uio_driver_name, 0); - vlib_sysfs_write ((char *) s, "0x%04x 0x%04x", d->vendor_id, - d->device_id); - } - vec_reset_length (s); - - s = format (s, "/sys/bus/pci/drivers/%s/bind%c", uio_driver_name, 0); - vlib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address); - vec_reset_length (s); - - if (clear_driver_override) - { - s = format (s, "%v/driver_override%c", dev_dir_name, 0); - vlib_sysfs_write ((char *) s, "%c", 0); - vec_reset_length (s); - } - -done: - closedir (dir); - vec_free (s); - vec_free (dev_dir_name); - vec_free (driver_name); - return error; -} - - -static clib_error_t * -scan_uio_dir (void *arg, u8 * path_name, u8 * file_name) -{ - linux_pci_device_t *l = arg; - unformat_input_t input; - - unformat_init_string (&input, (char *) file_name, vec_len (file_name)); - - if (!unformat (&input, "uio%d", &l->uio_minor)) - abort (); - - unformat_free (&input); - return 0; -} - -static clib_error_t * -linux_pci_uio_read_ready (unix_file_t * uf) -{ - vlib_pci_main_t *pm = &pci_main; - vlib_pci_device_t *d; - int __attribute__ ((unused)) rv; - - u32 icount; - rv = read (uf->file_descriptor, &icount, 4); - - d = pool_elt_at_index (pm->pci_devs, uf->private_data); - - if (d->interrupt_handler) - d->interrupt_handler (d); - - vlib_pci_intr_enable (d); - - return /* no error */ 0; -} - -static clib_error_t * -linux_pci_uio_error_ready (unix_file_t * uf) -{ - u32 error_index = (u32) uf->private_data; - - return clib_error_return (0, "pci device %d: error", error_index); -} - -static void -add_device (vlib_pci_device_t * dev, linux_pci_device_t * pdev) -{ - vlib_pci_main_t *pm = &pci_main; - linux_pci_main_t *lpm = &linux_pci_main; - linux_pci_device_t *l; - - pool_get (lpm->linux_pci_devices, l); - l[0] = pdev[0]; - - l->dev_dir_name = vec_dup (l->dev_dir_name); - - dev->os_handle = l - lpm->linux_pci_devices; - - { - u8 *uio_dir = format (0, "%s/uio", l->dev_dir_name); - foreach_directory_file ((char *) uio_dir, scan_uio_dir, l, /* scan_dirs */ - 1); - vec_free (uio_dir); - } - - { - char *uio_name = (char *) format (0, "/dev/uio%d%c", l->uio_minor, 0); - l->uio_fd = open (uio_name, O_RDWR); - if (l->uio_fd < 0) - clib_unix_error ("open `%s'", uio_name); - vec_free (uio_name); - } - - { - unix_file_t template = { 0 }; - unix_main_t *um = &unix_main; - - template.read_function = linux_pci_uio_read_ready; - template.file_descriptor = l->uio_fd; - template.error_function = linux_pci_uio_error_ready; - template.private_data = dev - pm->pci_devs; - - l->unix_file_index = unix_file_add (um, &template); - } -} - -static void -linux_pci_device_free (linux_pci_device_t * l) -{ - int i; - for (i = 0; i < vec_len (l->resource_fds); i++) - if (l->resource_fds[i] > 0) - close (l->resource_fds[i]); - if (l->config_fd > 0) - close (l->config_fd); - if (l->uio_fd > 0) - close (l->uio_fd); - vec_free (l->resource_fds); - vec_free (l->dev_dir_name); -} - -/* Configuration space read/write. */ -clib_error_t * -vlib_pci_read_write_config (vlib_pci_device_t * dev, - vlib_read_or_write_t read_or_write, - uword address, void *data, u32 n_bytes) -{ - linux_pci_main_t *lpm = &linux_pci_main; - linux_pci_device_t *p; - int n; - - p = pool_elt_at_index (lpm->linux_pci_devices, dev->os_handle); - - if (read_or_write == VLIB_READ) - n = pread (p->config_fd, data, n_bytes, address); - else - n = pwrite (p->config_fd, data, n_bytes, address); - - if (n != n_bytes) - return clib_error_return_unix (0, "%s", - read_or_write == VLIB_READ - ? "read" : "write"); - - return 0; -} - -static clib_error_t * -os_map_pci_resource_internal (uword os_handle, - u32 resource, u8 * addr, void **result) -{ - linux_pci_main_t *pm = &linux_pci_main; - linux_pci_device_t *p; - struct stat stat_buf; - u8 *file_name; - int fd; - clib_error_t *error; - int flags = MAP_SHARED; - - error = 0; - p = pool_elt_at_index (pm->linux_pci_devices, os_handle); - - file_name = format (0, "%v/resource%d%c", p->dev_dir_name, resource, 0); - fd = open ((char *) file_name, O_RDWR); - if (fd < 0) - { - error = clib_error_return_unix (0, "open `%s'", file_name); - goto done; - } - - if (fstat (fd, &stat_buf) < 0) - { - error = clib_error_return_unix (0, "fstat `%s'", file_name); - goto done; - } - - vec_validate (p->resource_fds, resource); - p->resource_fds[resource] = fd; - if (addr != 0) - flags |= MAP_FIXED; - - *result = mmap (addr, - /* size */ stat_buf.st_size, - PROT_READ | PROT_WRITE, flags, - /* file */ fd, - /* offset */ 0); - if (*result == (void *) -1) - { - error = clib_error_return_unix (0, "mmap `%s'", file_name); - goto done; - } - -done: - if (error) - { - if (fd >= 0) - close (fd); - } - vec_free (file_name); - return error; -} - -clib_error_t * -vlib_pci_map_resource (vlib_pci_device_t * dev, u32 resource, void **result) -{ - return (os_map_pci_resource_internal - (dev->os_handle, resource, 0 /* addr */ , - result)); -} - -clib_error_t * -vlib_pci_map_resource_fixed (vlib_pci_device_t * dev, - u32 resource, u8 * addr, void **result) -{ - return (os_map_pci_resource_internal - (dev->os_handle, resource, addr, result)); -} - -void -vlib_pci_free_device (vlib_pci_device_t * dev) -{ - linux_pci_main_t *pm = &linux_pci_main; - linux_pci_device_t *l; - - l = pool_elt_at_index (pm->linux_pci_devices, dev->os_handle); - linux_pci_device_free (l); - pool_put (pm->linux_pci_devices, l); -} - -pci_device_registration_t * __attribute__ ((unused)) -pci_device_next_registered (pci_device_registration_t * r) -{ - uword i; - - /* Null vendor id marks end of initialized list. */ - for (i = 0; r->supported_devices[i].vendor_id != 0; i++) - ; - - return clib_elf_section_data_next (r, i * sizeof (r->supported_devices[0])); -} - -static clib_error_t * -init_device_from_registered (vlib_main_t * vm, - vlib_pci_device_t * dev, - linux_pci_device_t * pdev) -{ - vlib_pci_main_t *pm = &pci_main; - pci_device_registration_t *r; - pci_device_id_t *i; - clib_error_t *error; - - r = pm->pci_device_registrations; - - while (r) - { - for (i = r->supported_devices; i->vendor_id != 0; i++) - if (i->vendor_id == dev->vendor_id && i->device_id == dev->device_id) - { - error = vlib_pci_bind_to_uio (dev, "uio_pci_generic"); - if (error) - { - clib_error_report (error); - continue; - } - - add_device (dev, pdev); - dev->interrupt_handler = r->interrupt_handler; - return r->init_function (vm, dev); - } - r = r->next_registration; - } - /* No driver, close the PCI config-space FD */ - close (pdev->config_fd); - return 0; -} - -static clib_error_t * -init_device (vlib_main_t * vm, - vlib_pci_device_t * dev, linux_pci_device_t * pdev) -{ - return init_device_from_registered (vm, dev, pdev); -} - -static clib_error_t * -scan_device (void *arg, u8 * dev_dir_name, u8 * ignored) -{ - vlib_main_t *vm = arg; - vlib_pci_main_t *pm = &pci_main; - int fd; - u8 *f; - clib_error_t *error = 0; - vlib_pci_device_t *dev; - linux_pci_device_t pdev = { 0 }; - u32 tmp; - - f = format (0, "%v/config%c", dev_dir_name, 0); - fd = open ((char *) f, O_RDWR); - - /* Try read-only access if write fails. */ - if (fd < 0) - fd = open ((char *) f, O_RDONLY); - - if (fd < 0) - { - error = clib_error_return_unix (0, "open `%s'", f); - goto done; - } - - pool_get (pm->pci_devs, dev); - - /* You can only read more that 64 bytes of config space as root; so we try to - read the full space but fall back to just the first 64 bytes. */ - if (read (fd, &dev->config_data, sizeof (dev->config_data)) != - sizeof (dev->config_data) - && read (fd, &dev->config0, - sizeof (dev->config0)) != sizeof (dev->config0)) - { - pool_put (pm->pci_devs, dev); - error = clib_error_return_unix (0, "read `%s'", f); - close (fd); - goto done; - } - - { - static pci_config_header_t all_ones; - if (all_ones.vendor_id == 0) - memset (&all_ones, ~0, sizeof (all_ones)); - - if (!memcmp (&dev->config0.header, &all_ones, sizeof (all_ones))) - { - pool_put (pm->pci_devs, dev); - error = clib_error_return (0, "invalid PCI config for `%s'", f); - close (fd); - goto done; - } - } - - if (dev->config0.header.header_type == 0) - pci_config_type0_little_to_host (&dev->config0); - else - pci_config_type1_little_to_host (&dev->config1); - - /* Parse bus, dev, function from directory name. */ - { - unformat_input_t input; - - unformat_init_string (&input, (char *) dev_dir_name, - vec_len (dev_dir_name)); - - if (!unformat (&input, "/sys/bus/pci/devices/%U", - unformat_vlib_pci_addr, &dev->bus_address)) - abort (); - - unformat_free (&input); - - } - - - pdev.config_fd = fd; - pdev.dev_dir_name = dev_dir_name; - - hash_set (pm->pci_dev_index_by_pci_addr, dev->bus_address.as_u32, - dev - pm->pci_devs); - - vec_reset_length (f); - f = format (f, "%v/vpd%c", dev_dir_name, 0); - fd = open ((char *) f, O_RDONLY); - if (fd >= 0) - { - while (1) - { - u8 tag[3]; - u8 *data = 0; - int len; - - if (read (fd, &tag, 3) != 3) - break; - - if (tag[0] != 0x82 && tag[0] != 0x90 && tag[0] != 0x91) - break; - - len = (tag[2] << 8) | tag[1]; - vec_validate (data, len); - - if (read (fd, data, len) != len) - { - vec_free (data); - break; - } - if (tag[0] == 0x82) - dev->product_name = data; - else if (tag[0] == 0x90) - dev->vpd_r = data; - else if (tag[0] == 0x91) - dev->vpd_w = data; - - data = 0; - } - close (fd); - } - - dev->numa_node = -1; - vec_reset_length (f); - f = format (f, "%v/numa_node%c", dev_dir_name, 0); - vlib_sysfs_read ((char *) f, "%u", &dev->numa_node); - - vec_reset_length (f); - f = format (f, "%v/class%c", dev_dir_name, 0); - vlib_sysfs_read ((char *) f, "0x%x", &tmp); - dev->device_class = tmp >> 8; - - vec_reset_length (f); - f = format (f, "%v/vendor%c", dev_dir_name, 0); - vlib_sysfs_read ((char *) f, "0x%x", &tmp); - dev->vendor_id = tmp; - - vec_reset_length (f); - f = format (f, "%v/device%c", dev_dir_name, 0); - vlib_sysfs_read ((char *) f, "0x%x", &tmp); - dev->device_id = tmp; - - error = init_device (vm, dev, &pdev); - - vec_reset_length (f); - f = format (f, "%v/driver%c", dev_dir_name, 0); - dev->driver_name = vlib_sysfs_link_to_name ((char *) f); - -done: - vec_free (f); - return error; -} - -clib_error_t * -linux_pci_init (vlib_main_t * vm) -{ - vlib_pci_main_t *pm = &pci_main; - clib_error_t *error; - - pm->vlib_main = vm; - - if ((error = vlib_call_init_function (vm, unix_input_init))) - return error; - - ASSERT (sizeof (vlib_pci_addr_t) == sizeof (u32)); - pm->pci_dev_index_by_pci_addr = hash_create (0, sizeof (uword)); - - error = foreach_directory_file ("/sys/bus/pci/devices", scan_device, vm, - /* scan_dirs */ 0); - - /* Complain and continue. might not be root, etc. */ - if (error) - clib_error_report (error); - - return error; -} - -VLIB_INIT_FUNCTION (linux_pci_init); - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vlib/threads_cli.c b/src/vlib/threads_cli.c index 36f8109e777..f8d5d8f9ad2 100644 --- a/src/vlib/threads_cli.c +++ b/src/vlib/threads_cli.c @@ -18,6 +18,7 @@ #include #include +#include #include static u8 * diff --git a/src/vlib/unix/physmem.c b/src/vlib/unix/physmem.c deleted file mode 100644 index d5d5d6c8ddd..00000000000 --- a/src/vlib/unix/physmem.c +++ /dev/null @@ -1,439 +0,0 @@ -/* - * Copyright (c) 2015 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - * physmem.c: Unix physical memory - * - * Copyright (c) 2008 Eliot Dresselhaus - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#ifndef __NR_memfd_create -#if defined __x86_64__ -#define __NR_memfd_create 319 -#elif defined __arm__ -#define __NR_memfd_create 385 -#elif defined __aarch64__ -#define __NR_memfd_create 279 -#else -#error "__NR_memfd_create unknown for this architecture" -#endif -#endif - -static inline int -memfd_create (const char *name, unsigned int flags) -{ - return syscall (__NR_memfd_create, name, flags); -} - -#ifndef F_LINUX_SPECIFIC_BASE -#define F_LINUX_SPECIFIC_BASE 1024 -#endif -#define MFD_ALLOW_SEALING 0x0002U -#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) -#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) - -#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ -#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ -#define F_SEAL_GROW 0x0004 /* prevent file from growing */ -#define F_SEAL_WRITE 0x0008 /* prevent writes */ - -static void * -unix_physmem_alloc_aligned (vlib_main_t * vm, vlib_physmem_region_index_t idx, - uword n_bytes, uword alignment) -{ - vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); - uword lo_offset, hi_offset; - uword *to_free = 0; - - if (pr->heap == 0) - return 0; - - /* IO memory is always at least cache aligned. */ - alignment = clib_max (alignment, CLIB_CACHE_LINE_BYTES); - - while (1) - { - mheap_get_aligned (pr->heap, n_bytes, - /* align */ alignment, - /* align offset */ 0, - &lo_offset); - - /* Allocation failed? */ - if (lo_offset == ~0) - break; - - if (pr->flags & VLIB_PHYSMEM_F_FAKE) - break; - - /* Make sure allocation does not span DMA physical chunk boundary. */ - hi_offset = lo_offset + n_bytes - 1; - - if ((lo_offset >> pr->log2_page_size) == - (hi_offset >> pr->log2_page_size)) - break; - - /* Allocation would span chunk boundary, queue it to be freed as soon as - we find suitable chunk. */ - vec_add1 (to_free, lo_offset); - } - - if (to_free != 0) - { - uword i; - for (i = 0; i < vec_len (to_free); i++) - mheap_put (pr->heap, to_free[i]); - vec_free (to_free); - } - - return lo_offset != ~0 ? pr->heap + lo_offset : 0; -} - -static void -unix_physmem_free (vlib_main_t * vm, vlib_physmem_region_index_t idx, void *x) -{ - vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); - /* Return object to region's heap. */ - mheap_put (pr->heap, x - pr->heap); -} - -static u64 -get_page_paddr (int fd, uword addr) -{ - int pagesize = sysconf (_SC_PAGESIZE); - u64 seek, pagemap = 0; - - seek = ((u64) addr / pagesize) * sizeof (u64); - if (lseek (fd, seek, SEEK_SET) != seek) - { - clib_unix_warning ("lseek to 0x%llx", seek); - return 0; - } - if (read (fd, &pagemap, sizeof (pagemap)) != (sizeof (pagemap))) - { - clib_unix_warning ("read ptbits"); - return 0; - } - if ((pagemap & (1ULL << 63)) == 0) - return 0; - - pagemap &= pow2_mask (55); - - return pagemap * pagesize; -} - -static clib_error_t * -unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, - u8 numa_node, u32 flags, - vlib_physmem_region_index_t * idx) -{ - vlib_physmem_main_t *vpm = &vm->physmem_main; - vlib_physmem_region_t *pr; - clib_error_t *error = 0; - int pagemap_fd = -1; - u8 *mount_dir = 0; - u8 *filename = 0; - struct stat st; - int old_mpol; - int mmap_flags; - struct bitmask *old_mask = numa_allocate_nodemask (); - - if (geteuid () != 0 && (flags & VLIB_PHYSMEM_F_FAKE) == 0) - return clib_error_return (0, "not allowed"); - - pool_get (vpm->regions, pr); - - if ((pr - vpm->regions) >= 256) - { - error = clib_error_return (0, "maximum number of regions reached"); - goto error; - } - - pr->index = pr - vpm->regions; - pr->fd = -1; - pr->flags = flags; - - if (get_mempolicy (&old_mpol, old_mask->maskp, old_mask->size + 1, NULL, 0) - == -1) - { - error = clib_error_return_unix (0, "get_mempolicy"); - goto error; - } - - if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) - { - if ((pagemap_fd = open ((char *) "/proc/self/pagemap", O_RDONLY)) == -1) - { - error = clib_error_return_unix (0, "open '/proc/self/pagemap'"); - goto error; - } - - mount_dir = format (0, "%s/physmem_region%d%c", - vlib_unix_get_runtime_dir (), pr->index, 0); - filename = format (0, "%s/mem%c", mount_dir, 0); - - unlink ((char *) mount_dir); - - error = vlib_unix_recursive_mkdir ((char *) mount_dir); - if (error) - goto error; - - if (mount ("none", (char *) mount_dir, "hugetlbfs", 0, NULL)) - { - error = clib_error_return_unix (0, "mount hugetlb directory '%s'", - mount_dir); - goto error; - } - - if ((pr->fd = open ((char *) filename, O_CREAT | O_RDWR, 0755)) == -1) - { - error = clib_error_return_unix (0, "open"); - goto error; - } - - mmap_flags = MAP_SHARED | MAP_HUGETLB | MAP_LOCKED; - } - else - { - if ((pr->fd = memfd_create (name, MFD_ALLOW_SEALING)) == -1) - return clib_error_return_unix (0, "memfd_create"); - - if ((fcntl (pr->fd, F_ADD_SEALS, F_SEAL_SHRINK)) == -1) - { - error = - clib_error_return_unix (0, "fcntl (F_ADD_SEALS, F_SEAL_SHRINK)"); - goto error; - } - mmap_flags = MAP_SHARED; - } - - if (fstat (pr->fd, &st)) - { - error = clib_error_return_unix (0, "fstat"); - goto error; - } - - pr->log2_page_size = min_log2 (st.st_blksize); - pr->n_pages = ((size - 1) >> pr->log2_page_size) + 1; - size = pr->n_pages * (1 << pr->log2_page_size); - - if ((ftruncate (pr->fd, size)) == -1) - { - error = clib_error_return_unix (0, "ftruncate length: %d", size); - goto error; - } - - if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) - { - error = vlib_sysfs_prealloc_hugepages (numa_node, - 1 << (pr->log2_page_size - 10), - pr->n_pages); - if (error) - goto error; - } - - numa_set_preferred (numa_node); - - pr->mem = mmap (0, size, (PROT_READ | PROT_WRITE), mmap_flags, pr->fd, 0); - - if (pr->mem == MAP_FAILED) - { - pr->mem = 0; - error = clib_error_return_unix (0, "mmap"); - goto error; - } - - if (set_mempolicy (old_mpol, old_mask->maskp, old_mask->size + 1) == -1) - { - error = clib_error_return_unix (0, "set_mempolicy"); - goto error; - } - - pr->size = pr->n_pages << pr->log2_page_size; - pr->page_mask = (1 << pr->log2_page_size) - 1; - pr->numa_node = numa_node; - pr->name = format (0, "%s", name); - - if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) - { - int i; - for (i = 0; i < pr->n_pages; i++) - { - void *ptr = pr->mem + (i << pr->log2_page_size); - int node; - move_pages (0, 1, &ptr, 0, &node, 0); - if (numa_node != node) - { - clib_warning - ("physmem page for region \'%s\' allocated on the wrong" - " numa node (requested %u actual %u)", pr->name, - pr->numa_node, node, i); - break; - } - } - } - - if (flags & VLIB_PHYSMEM_F_INIT_MHEAP) - { - pr->heap = mheap_alloc_with_flags (pr->mem, pr->size, - /* Don't want mheap mmap/munmap with IO memory. */ - MHEAP_FLAG_DISABLE_VM | - MHEAP_FLAG_THREAD_SAFE); - fformat (stdout, "%U", format_mheap, pr->heap, /* verbose */ 1); - } - - if (flags & VLIB_PHYSMEM_F_HAVE_BUFFERS) - { - vlib_buffer_add_mem_range (vm, pointer_to_uword (pr->mem), pr->size); - } - - *idx = pr->index; - - if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) - { - int i; - for (i = 0; i < pr->n_pages; i++) - { - uword vaddr = - pointer_to_uword (pr->mem) + (((u64) i) << pr->log2_page_size); - u64 page_paddr = get_page_paddr (pagemap_fd, vaddr); - vec_add1 (pr->page_table, page_paddr); - } - } - - goto done; - -error: - if (pr->fd > -1) - close (pr->fd); - - if (pr->mem) - munmap (pr->mem, size); - - memset (pr, 0, sizeof (*pr)); - pool_put (vpm->regions, pr); - -done: - if (mount_dir) - { - umount2 ((char *) mount_dir, MNT_DETACH); - rmdir ((char *) mount_dir); - vec_free (mount_dir); - } - numa_free_cpumask (old_mask); - vec_free (filename); - if (pagemap_fd > -1) - close (pagemap_fd); - return error; -} - -static void -unix_physmem_region_free (vlib_main_t * vm, vlib_physmem_region_index_t idx) -{ - vlib_physmem_main_t *vpm = &vm->physmem_main; - vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); - - if (pr->fd > 0) - close (pr->fd); - munmap (pr->mem, pr->size); - vec_free (pr->name); - pool_put (vpm->regions, pr); -} - -clib_error_t * -unix_physmem_init (vlib_main_t * vm) -{ - clib_error_t *error = 0; - - /* Avoid multiple calls. */ - if (vm->os_physmem_alloc_aligned) - return error; - - vm->os_physmem_alloc_aligned = unix_physmem_alloc_aligned; - vm->os_physmem_free = unix_physmem_free; - vm->os_physmem_region_alloc = unix_physmem_region_alloc; - vm->os_physmem_region_free = unix_physmem_region_free; - - return error; -} - -static clib_error_t * -show_physmem (vlib_main_t * vm, - unformat_input_t * input, vlib_cli_command_t * cmd) -{ - vlib_physmem_main_t *vpm = &vm->physmem_main; - vlib_physmem_region_t *pr; - - /* *INDENT-OFF* */ - pool_foreach (pr, vpm->regions, ( - { - vlib_cli_output (vm, "index %u name '%s' page-size %uKB num-pages %d " - "numa-node %u fd %d\n", - pr->index, pr->name, (1 << (pr->log2_page_size -10)), - pr->n_pages, pr->numa_node, pr->fd); - if (pr->heap) - vlib_cli_output (vm, " %U", format_mheap, pr->heap, /* verbose */ 1); - else - vlib_cli_output (vm, " no heap\n"); - })); - /* *INDENT-ON* */ - return 0; -} - -/* *INDENT-OFF* */ -VLIB_CLI_COMMAND (show_physmem_command, static) = { - .path = "show physmem", - .short_help = "Show physical memory allocation", - .function = show_physmem, -}; -/* *INDENT-ON* */ - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vlib/unix/unix.h b/src/vlib/unix/unix.h index b5a33427d36..1b0d8b9d384 100644 --- a/src/vlib/unix/unix.h +++ b/src/vlib/unix/unix.h @@ -217,23 +217,6 @@ extern u8 **vlib_thread_stacks; /* utils */ -clib_error_t *vlib_sysfs_write (char *file_name, char *fmt, ...); - -clib_error_t *vlib_sysfs_read (char *file_name, char *fmt, ...); - -u8 *vlib_sysfs_link_to_name (char *link); - -clib_error_t *vlib_sysfs_set_nr_hugepages (unsigned int numa_node, - int page_size, int nr); -clib_error_t *vlib_sysfs_get_nr_hugepages (unsigned int numa_node, - int page_size, int *v); -clib_error_t *vlib_sysfs_get_free_hugepages (unsigned int numa_node, - int page_size, int *v); -clib_error_t *vlib_sysfs_get_surplus_hugepages (unsigned int numa_node, - int page_size, int *v); -clib_error_t *vlib_sysfs_prealloc_hugepages (unsigned int numa_node, - int page_size, int nr); - clib_error_t *foreach_directory_file (char *dir_name, clib_error_t * (*f) (void *arg, u8 * path_name, diff --git a/src/vlib/unix/util.c b/src/vlib/unix/util.c index 0e252aca345..5472751e079 100644 --- a/src/vlib/unix/util.c +++ b/src/vlib/unix/util.c @@ -98,225 +98,6 @@ foreach_directory_file (char *dir_name, return error; } -clib_error_t * -vlib_sysfs_write (char *file_name, char *fmt, ...) -{ - u8 *s; - int fd; - clib_error_t *error = 0; - - fd = open (file_name, O_WRONLY); - if (fd < 0) - return clib_error_return_unix (0, "open `%s'", file_name); - - va_list va; - va_start (va, fmt); - s = va_format (0, fmt, &va); - va_end (va); - - if (write (fd, s, vec_len (s)) < 0) - error = clib_error_return_unix (0, "write `%s'", file_name); - - vec_free (s); - close (fd); - return error; -} - -clib_error_t * -vlib_sysfs_read (char *file_name, char *fmt, ...) -{ - unformat_input_t input; - u8 *s = 0; - int fd; - ssize_t sz; - uword result; - - fd = open (file_name, O_RDONLY); - if (fd < 0) - return clib_error_return_unix (0, "open `%s'", file_name); - - vec_validate (s, 4095); - - sz = read (fd, s, vec_len (s)); - if (sz < 0) - { - close (fd); - vec_free (s); - return clib_error_return_unix (0, "read `%s'", file_name); - } - - _vec_len (s) = sz; - unformat_init_vector (&input, s); - - va_list va; - va_start (va, fmt); - result = va_unformat (&input, fmt, &va); - va_end (va); - - vec_free (s); - close (fd); - - if (result == 0) - return clib_error_return (0, "unformat error"); - - return 0; -} - -u8 * -vlib_sysfs_link_to_name (char *link) -{ - char *p, buffer[64]; - unformat_input_t in; - u8 *s = 0; - int r; - - r = readlink (link, buffer, sizeof (buffer) - 1); - - if (r < 0) - return 0; - - buffer[r] = 0; - p = strrchr (buffer, '/'); - - if (!p) - return 0; - - unformat_init_string (&in, p + 1, strlen (p + 1)); - if (unformat (&in, "%s", &s) != 1) - clib_unix_warning ("no string?"); - unformat_free (&in); - - return s; -} - -clib_error_t * -vlib_sysfs_set_nr_hugepages (unsigned int numa_node, int page_size, int nr) -{ - clib_error_t *error = 0; - struct stat sb; - u8 *p = 0; - - p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0); - - if (stat ((char *) p, &sb) == 0) - { - if (S_ISDIR (sb.st_mode) == 0) - { - error = clib_error_return (0, "'%s' is not directory", p); - goto done; - } - } - else if (numa_node == 0) - { - vec_reset_length (p); - p = format (p, "/sys/kernel/mm%c", 0); - if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0) - { - error = clib_error_return (0, "'%s' does not exist or it is not " - "directory", p); - goto done; - } - } - else - { - error = clib_error_return (0, "'%s' does not exist", p); - goto done; - } - - _vec_len (p) -= 1; - p = format (p, "/hugepages/hugepages-%ukB/nr_hugepages%c", page_size, 0); - vlib_sysfs_write ((char *) p, "%d", nr); - -done: - vec_free (p); - return error; -} - - -static clib_error_t * -vlib_sysfs_get_xxx_hugepages (char *type, unsigned int numa_node, - int page_size, int *val) -{ - clib_error_t *error = 0; - struct stat sb; - u8 *p = 0; - - p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0); - - if (stat ((char *) p, &sb) == 0) - { - if (S_ISDIR (sb.st_mode) == 0) - { - error = clib_error_return (0, "'%s' is not directory", p); - goto done; - } - } - else if (numa_node == 0) - { - vec_reset_length (p); - p = format (p, "/sys/kernel/mm%c", 0); - if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0) - { - error = clib_error_return (0, "'%s' does not exist or it is not " - "directory", p); - goto done; - } - } - else - { - error = clib_error_return (0, "'%s' does not exist", p); - goto done; - } - - _vec_len (p) -= 1; - p = format (p, "/hugepages/hugepages-%ukB/%s_hugepages%c", page_size, - type, 0); - error = vlib_sysfs_read ((char *) p, "%d", val); - -done: - vec_free (p); - return error; -} - -clib_error_t * -vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size, int *v) -{ - return vlib_sysfs_get_xxx_hugepages ("free", numa_node, page_size, v); -} - -clib_error_t * -vlib_sysfs_get_nr_hugepages (unsigned int numa_node, int page_size, int *v) -{ - return vlib_sysfs_get_xxx_hugepages ("nr", numa_node, page_size, v); -} - -clib_error_t * -vlib_sysfs_get_surplus_hugepages (unsigned int numa_node, int page_size, - int *v) -{ - return vlib_sysfs_get_xxx_hugepages ("surplus", numa_node, page_size, v); -} - -clib_error_t * -vlib_sysfs_prealloc_hugepages (unsigned int numa_node, int page_size, int nr) -{ - clib_error_t *error = 0; - int n, needed; - error = vlib_sysfs_get_free_hugepages (numa_node, page_size, &n); - if (error) - return error; - needed = nr - n; - if (needed <= 0) - return 0; - - error = vlib_sysfs_get_nr_hugepages (numa_node, page_size, &n); - if (error) - return error; - clib_warning ("pre-allocating %u additional %uK hugepages on numa node %u", - needed, page_size, numa_node); - return vlib_sysfs_set_nr_hugepages (numa_node, page_size, n + needed); -} - clib_error_t * vlib_unix_recursive_mkdir (char *path) { diff --git a/src/vnet/devices/af_packet/af_packet.c b/src/vnet/devices/af_packet/af_packet.c index ea52878dbcf..e7e6921421b 100644 --- a/src/vnet/devices/af_packet/af_packet.c +++ b/src/vnet/devices/af_packet/af_packet.c @@ -26,6 +26,7 @@ #include #include +#include #include #include -- cgit 1.2.3-korg