/* * Copyright (c) 2016 Cisco and/or its affiliates. * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at: * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * pci.c: Linux user space PCI bus management. * * Copyright (c) 2008 Eliot Dresselhaus * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static const char *sysfs_pci_dev_path = "/sys/bus/pci/devices"; static const char *sysfs_pci_drv_path = "/sys/bus/pci/drivers"; static char *sysfs_mod_vfio_noiommu = "/sys/module/vfio/parameters/enable_unsafe_noiommu_mode"; typedef struct { int fd; void *addr; size_t size; } linux_pci_region_t; typedef struct { int fd; u32 clib_file_index; union { pci_intx_handler_function_t *intx_handler; pci_msix_handler_function_t *msix_handler; }; } linux_pci_irq_t; typedef enum { LINUX_PCI_DEVICE_TYPE_UNKNOWN, LINUX_PCI_DEVICE_TYPE_UIO, LINUX_PCI_DEVICE_TYPE_VFIO, } linux_pci_device_type_t; typedef struct { linux_pci_device_type_t type; vlib_pci_dev_handle_t handle; vlib_pci_addr_t addr; /* Resource file descriptors. */ linux_pci_region_t *regions; /* File descriptor for config space read/write. */ int config_fd; u64 config_offset; /* Device File descriptor */ int fd; /* Minor device for uio device. */ u32 uio_minor; /* Interrupt handlers */ linux_pci_irq_t intx_irq; linux_pci_irq_t *msix_irqs; /* private data */ uword private_data; } linux_pci_device_t; /* Pool of PCI devices. 
*/ typedef struct { vlib_main_t *vlib_main; linux_pci_device_t *linux_pci_devices; } linux_pci_main_t; extern linux_pci_main_t linux_pci_main; static linux_pci_device_t * linux_pci_get_device (vlib_pci_dev_handle_t h) { linux_pci_main_t *lpm = &linux_pci_main; return pool_elt_at_index (lpm->linux_pci_devices, h); } uword vlib_pci_get_private_data (vlib_pci_dev_handle_t h) { linux_pci_device_t *d = linux_pci_get_device (h); return d->private_data; } void vlib_pci_set_private_data (vlib_pci_dev_handle_t h, uword private_data) { linux_pci_device_t *d = linux_pci_get_device (h); d->private_data = private_data; } vlib_pci_addr_t * vlib_pci_get_addr (vlib_pci_dev_handle_t h) { linux_pci_device_t *d = linux_pci_get_device (h); return &d->addr; } /* Call to allocate/initialize the pci subsystem. This is not an init function so that users can explicitly enable pci only when it's needed. */ clib_error_t *pci_bus_init (vlib_main_t * vm); linux_pci_main_t linux_pci_main; vlib_pci_device_info_t * vlib_pci_get_device_info (vlib_pci_addr_t * addr, clib_error_t ** error) { linux_vfio_main_t *lvm = &vfio_main; clib_error_t *err; vlib_pci_device_info_t *di; u8 *f = 0; u32 tmp; int fd; di = clib_mem_alloc (sizeof (vlib_pci_device_info_t)); memset (di, 0, sizeof (vlib_pci_device_info_t)); di->addr.as_u32 = addr->as_u32; u8 *dev_dir_name = format (0, "%s/%U", sysfs_pci_dev_path, format_vlib_pci_addr, addr); f = format (0, "%v/config%c", dev_dir_name, 0); fd = open ((char *) f, O_RDWR); /* Try read-only access if write fails. */ if (fd < 0) fd = open ((char *) f, O_RDONLY); if (fd < 0) { err = clib_error_return_unix (0, "open `%s'", f); goto error; } /* You can only read more that 64 bytes of config space as root; so we try to read the full space but fall back to just the first 64 bytes. 
*/ if (read (fd, &di->config_data, sizeof (di->config_data)) < sizeof (di->config0)) { err = clib_error_return_unix (0, "read `%s'", f); close (fd); goto error; } { static pci_config_header_t all_ones; if (all_ones.vendor_id == 0) memset (&all_ones, ~0, sizeof (all_ones)); if (!memcmp (&di->config0.header, &all_ones, sizeof (all_ones))) { err = clib_error_return (0, "invalid PCI config for `%s'", f); close (fd); goto error; } } if (di->config0.header.header_type == 0) pci_config_type0_little_to_host (&di->config0); else pci_config_type1_little_to_host (&di->config1); di->numa_node = -1; vec_reset_length (f); f = format (f, "%v/numa_node%c", dev_dir_name, 0); err = clib_sysfs_read ((char *) f, "%u", &di->numa_node); if (err) { di->numa_node = -1; clib_error_free (err); } vec_reset_length (f); f = format (f, "%v/class%c", dev_dir_name, 0); err = clib_sysfs_read ((char *) f, "0x%x", &tmp); if (err) goto error; di->device_class = tmp >> 8; vec_reset_length (f); f = format (f, "%v/vendor%c", dev_dir_name, 0); err = clib_sysfs_read ((char *) f, "0x%x", &tmp); if (err) goto error; di->vendor_id = tmp; vec_reset_length (f); f = format (f, "%v/device%c", dev_dir_name, 0); err = clib_sysfs_read ((char *) f, "0x%x", &tmp); if (err) goto error; di->device_id = tmp; vec_reset_length (f); f = format (f, "%v/driver%c", dev_dir_name, 0); di->driver_name = clib_sysfs_link_to_name ((char *) f); di->iommu_group = -1; if (lvm->container_fd != -1) { u8 *tmpstr; vec_reset_length (f); f = format (f, "%v/iommu_group%c", dev_dir_name, 0); tmpstr = clib_sysfs_link_to_name ((char *) f); if (tmpstr) { di->iommu_group = atoi ((char *) tmpstr); vec_free (tmpstr); } vec_reset_length (f); f = format (f, "%v/iommu_group/name%c", dev_dir_name, 0); err = clib_sysfs_read ((char *) f, "%s", &tmpstr); if (err == 0) { if (strncmp ((char *) tmpstr, "vfio-noiommu", 12) == 0) di->flags |= VLIB_PCI_DEVICE_INFO_F_NOIOMMU; vec_free (tmpstr); } else clib_error_free (err); } close (fd); vec_reset_length (f); f 
= format (f, "%v/vpd%c", dev_dir_name, 0); fd = open ((char *) f, O_RDONLY); if (fd >= 0) { while (1) { u8 tag[3]; u8 *data = 0; uword len; if (read (fd, &tag, 3) != 3) break; if (tag[0] != 0x82 && tag[0] != 0x90 && tag[0] != 0x91) break; len = (tag[2] << 8) | tag[1]; vec_validate (data, len); if (read (fd, data, len) != len) { vec_free (data); break; } if (tag[0] == 0x82) di->product_name = data; else if (tag[0] == 0x90) di->vpd_r = data; else if (tag[0] == 0x9
Capturing VPP core dumps
========================

To debug a VPP crash, you need to provide a core dump file, which makes
it possible to backtrace the issue. Follow the steps below to capture a
core dump:

1. Disable k8s Probes to Prevent k8s from Restarting the POD with a Crashed VPP
-------------------------------------------------------------------------------

Disable the probes as described in
`BUG_REPORTS.md <BUG_REPORTS.html#collecting-the-logs-in-case-of-crash-loop>`__.

2. Modify VPP Startup config file
---------------------------------

In ``/etc/vpp/contiv-vswitch.conf``, add the following lines into the
``unix`` section:

::

   unix {
       ...
       coredump-size unlimited
       full-coredump
   }

3. Turn on Coredumps in the Vswitch Container
---------------------------------------------

After re-deploying Contiv-VPP networking, open a bash shell in the
vswitch container (use the actual name of your vswitch POD;
``contiv-vswitch-7whk7`` is the name used in this example):

::

   kubectl exec -it contiv-vswitch-7whk7 -n kube-system -c contiv-vswitch bash

Enable coredumps:

::

   mkdir -p /tmp/dumps
   sysctl -w debug.exception-trace=1
   sysctl -w kernel.core_pattern="/tmp/dumps/%e-%t"
   ulimit -c unlimited
   echo 2 > /proc/sys/fs/suid_dumpable

4. Let VPP Crash
----------------

Now repeat the steps that lead to the VPP crash. You can also force VPP
to crash at the point where it is currently running (e.g., if it is
stuck) by sending it the SIGQUIT signal:

::

   kill -3 `pidof vpp`

5. Locate and Inspect the Core File
-----------------------------------

The core file should appear in ``/tmp/dumps`` in the container:

::

   cd /tmp/dumps
   ls
   vpp_main-1524124440

After installing gdb, you can obtain a backtrace from the core file:

::

   apt-get update && apt-get install gdb
   gdb vpp vpp_main-1524124440
   (gdb) bt

6. Copy the Core File Out of the Container
------------------------------------------

Finally, copy the core file out of the container. First, while still
inside the container, pack the core file into an archive:

::

   cd /tmp/dumps
   tar cvzf vppdump.tar.gz vpp_main-1524124440

Now, on the host, determine the docker ID of the container, and then
copy the file out of the container:

::

   docker ps | grep vswitch_contiv
   d7aceb2e4876        c43a70ac3d01                                             "/usr/bin/supervisor…"   25 minutes ago      Up 25 minutes                           k8s_contiv-vswitch_contiv-vswitch-zqzn6_kube-system_9923952f-43a6-11e8-be84-080027de08ea_0

   docker cp d7aceb2e4876:/tmp/dumps/vppdump.tar.gz .

Now you are ready to file a bug in `jira.fd.io <https://jira.fd.io/>`__
and attach the core file.
irq->fd; } return vfio_set_irqs (p, VFIO_PCI_MSIX_IRQ_INDEX, start, count, VFIO_IRQ_SET_ACTION_TRIGGER, fds); } clib_error_t * vlib_pci_disable_msix_irq (vlib_pci_dev_handle_t h, u16 start, u16 count) { linux_pci_device_t *p = linux_pci_get_device (h); int i, fds[count]; if (p->type != LINUX_PCI_DEVICE_TYPE_VFIO) return clib_error_return (0, "vfio driver is needed for MSI-X interrupt " "support"); for (i = start; i < start + count; i++) fds[i] = -1; return vfio_set_irqs (p, VFIO_PCI_MSIX_IRQ_INDEX, start, count, VFIO_IRQ_SET_ACTION_TRIGGER, fds); } static clib_error_t * add_device_vfio (linux_pci_device_t * p, vlib_pci_device_info_t * di, pci_device_registration_t * r) { linux_pci_main_t *lpm = &linux_pci_main; struct vfio_device_info device_info = { 0 }; struct vfio_region_info reg = { 0 }; clib_error_t *err = 0; u8 *s = 0; p->addr.as_u32 = di->addr.as_u32; p->type = LINUX_PCI_DEVICE_TYPE_VFIO; if (di->driver_name == 0 || (strcmp ("vfio-pci", (char *) di->driver_name) != 0)) return clib_error_return (0, "Device '%U' (iommu group %d) not bound to " "vfio-pci", format_vlib_pci_addr, &di->addr, di->iommu_group); if ((err = linux_vfio_group_get_device_fd (&p->addr, &p->fd))) return err; device_info.argsz = sizeof (device_info); if (ioctl (p->fd, VFIO_DEVICE_GET_INFO, &device_info) < 0) { err = clib_error_return_unix (0, "ioctl(VFIO_DEVICE_GET_INFO) '%U'", format_vlib_pci_addr, &di->addr); goto error; } reg.argsz = sizeof (struct vfio_region_info); reg.index = VFIO_PCI_CONFIG_REGION_INDEX; if (ioctl (p->fd, VFIO_DEVICE_GET_REGION_INFO, ®) < 0) { err = clib_error_return_unix (0, "ioctl(VFIO_DEVICE_GET_INFO) '%U'", format_vlib_pci_addr, &di->addr); goto error; } p->config_offset = reg.offset; p->config_fd = p->fd; /* reset if device supports it */ if (device_info.flags & VFIO_DEVICE_FLAGS_RESET) if (ioctl (p->fd, VFIO_DEVICE_RESET) < 0) { err = clib_error_return_unix (0, "ioctl(VFIO_DEVICE_RESET) '%U'", format_vlib_pci_addr, &di->addr); goto error; } if (r && 
r->interrupt_handler) { vlib_pci_register_intx_handler (p->handle, r->interrupt_handler); linux_pci_vfio_unmask_intx (p); } if (r && r->init_function) err = r->init_function (lpm->vlib_main, p->handle); error: vec_free (s); if (err) { if (p->fd != -1) close (p->fd); if (p->config_fd != -1 && p->config_fd != p->fd) close (p->config_fd); p->config_fd = p->fd = -1; } return err; } /* Configuration space read/write. */ clib_error_t * vlib_pci_read_write_config (vlib_pci_dev_handle_t h, vlib_read_or_write_t read_or_write, uword address, void *data, u32 n_bytes) { linux_pci_device_t *p = linux_pci_get_device (h); int n; if (read_or_write == VLIB_READ) n = pread (p->config_fd, data, n_bytes, p->config_offset + address); else n = pwrite (p->config_fd, data, n_bytes, p->config_offset + address); if (n != n_bytes) return clib_error_return_unix (0, "%s", read_or_write == VLIB_READ ? "read" : "write"); return 0; } static clib_error_t * vlib_pci_map_region_int (vlib_pci_dev_handle_t h, u32 bar, u8 * addr, void **result) { linux_pci_device_t *p = linux_pci_get_device (h); int fd = -1; clib_error_t *error; int flags = MAP_SHARED; u64 size = 0, offset = 0; ASSERT (bar <= 5); error = 0; if (p->type == LINUX_PCI_DEVICE_TYPE_UIO) { u8 *file_name; struct stat stat_buf; file_name = format (0, "%s/%U/resource%d%c", sysfs_pci_dev_path, format_vlib_pci_addr, &p->addr, bar, 0); fd = open ((char *) file_name, O_RDWR); if (fd < 0) { error = clib_error_return_unix (0, "open `%s'", file_name); vec_free (file_name); return error; } if (fstat (fd, &stat_buf) < 0) { error = clib_error_return_unix (0, "fstat `%s'", file_name); vec_free (file_name); close (fd); return error; } vec_free (file_name); if (addr != 0) flags |= MAP_FIXED; size = stat_buf.st_size; offset = 0; } else if (p->type == LINUX_PCI_DEVICE_TYPE_VFIO) { struct vfio_region_info reg = { 0 }; reg.argsz = sizeof (struct vfio_region_info); reg.index = bar; if (ioctl (p->fd, VFIO_DEVICE_GET_REGION_INFO, ®) < 0) return 
clib_error_return_unix (0, "ioctl(VFIO_DEVICE_GET_INFO) " "'%U'", format_vlib_pci_addr, &p->addr); fd = p->fd; size = reg.size; offset = reg.offset; } else ASSERT (0); *result = mmap (addr, size, PROT_READ | PROT_WRITE, flags, fd, offset); if (*result == (void *) -1) { error = clib_error_return_unix (0, "mmap `BAR%u'", bar); if (p->type == LINUX_PCI_DEVICE_TYPE_UIO) close (fd); return error; } /* *INDENT-OFF* */ vec_validate_init_empty (p->regions, bar, (linux_pci_region_t) { .fd = -1}); /* *INDENT-ON* */ if (p->type == LINUX_PCI_DEVICE_TYPE_UIO) p->regions[bar].fd = fd; p->regions[bar].addr = *result; p->regions[bar].size = size; return 0; } clib_error_t * vlib_pci_map_region (vlib_pci_dev_handle_t h, u32 resource, void **result) { return (vlib_pci_map_region_int (h, resource, 0 /* addr */ , result)); } clib_error_t * vlib_pci_map_region_fixed (vlib_pci_dev_handle_t h, u32 resource, u8 * addr, void **result) { return (vlib_pci_map_region_int (h, resource, addr, result)); } clib_error_t * vlib_pci_device_open (vlib_pci_addr_t * addr, pci_device_id_t ids[], vlib_pci_dev_handle_t * handle) { linux_pci_main_t *lpm = &linux_pci_main; vlib_pci_device_info_t *di; linux_pci_device_t *p; clib_error_t *err = 0; pci_device_id_t *i; di = vlib_pci_get_device_info (addr, &err); if (err) return err; for (i = ids; i->vendor_id != 0; i++) if (i->vendor_id == di->vendor_id && i->device_id == di->device_id) break; if (i->vendor_id == 0) return clib_error_return (0, "Wrong vendor or device id"); pool_get (lpm->linux_pci_devices, p); p->handle = p - lpm->linux_pci_devices; p->intx_irq.fd = -1; if (di->iommu_group != -1) err = add_device_vfio (p, di, 0); else err = add_device_uio (p, di, 0); if (err) goto error; *handle = p->handle; error: vlib_pci_free_device_info (di); if (err) { memset (p, 0, sizeof (linux_pci_device_t)); pool_put (lpm->linux_pci_devices, p); } return err; } void vlib_pci_device_close (vlib_pci_dev_handle_t h) { linux_pci_main_t *lpm = &linux_pci_main; 
linux_pci_device_t *p = linux_pci_get_device (h); linux_pci_irq_t *irq; linux_pci_region_t *res; clib_error_t *err = 0; if (p->type == LINUX_PCI_DEVICE_TYPE_UIO) { irq = &p->intx_irq; clib_file_del_by_index (&file_main, irq->clib_file_index); close (p->config_fd); } else if (p->type == LINUX_PCI_DEVICE_TYPE_VFIO) { irq = &p->intx_irq; /* close INTx irqs */ if (irq->fd != -1) { err = vfio_set_irqs (p, VFIO_PCI_INTX_IRQ_INDEX, 0, 0, VFIO_IRQ_SET_ACTION_TRIGGER, 0); clib_error_free (err); clib_file_del_by_index (&file_main, irq->clib_file_index); close (irq->fd); } /* close MSI-X irqs */ if (vec_len (p->msix_irqs)) { err = vfio_set_irqs (p, VFIO_PCI_MSIX_IRQ_INDEX, 0, 0, VFIO_IRQ_SET_ACTION_TRIGGER, 0); clib_error_free (err); /* *INDENT-OFF* */ vec_foreach (irq, p->msix_irqs) { if (irq->fd == -1) continue; clib_file_del_by_index (&file_main, irq->clib_file_index); close (irq->fd); } /* *INDENT-ON* */ vec_free (p->msix_irqs); } } /* *INDENT-OFF* */ vec_foreach (res, p->regions) { if (res->size == 0) continue; munmap (res->addr, res->size); if (res->fd != -1) close (res->fd); } /* *INDENT-ON* */ vec_free (p->regions); close (p->fd); memset (p, 0, sizeof (linux_pci_device_t)); pool_put (lpm->linux_pci_devices, p); } void init_device_from_registered (vlib_pci_device_info_t * di) { vlib_pci_main_t *pm = &pci_main; linux_pci_main_t *lpm = &linux_pci_main; pci_device_registration_t *r; pci_device_id_t *i; clib_error_t *err = 0; linux_pci_device_t *p; pool_get (lpm->linux_pci_devices, p); p->handle = p - lpm->linux_pci_devices; p->intx_irq.fd = -1; r = pm->pci_device_registrations; while (r) { for (i = r->supported_devices; i->vendor_id != 0; i++) if (i->vendor_id == di->vendor_id && i->device_id == di->device_id) { if (di->iommu_group != -1) err = add_device_vfio (p, di, r); else err = add_device_uio (p, di, r); if (err) clib_error_report (err); else return; } r = r->next_registration; } /* No driver, close the PCI config-space FD */ memset (p, 0, sizeof 
(linux_pci_device_t)); pool_put (lpm->linux_pci_devices, p); } static clib_error_t * scan_pci_addr (void *arg, u8 * dev_dir_name, u8 * ignored) { vlib_pci_addr_t addr, **addrv = arg; unformat_input_t input; clib_error_t *err = 0; unformat_init_string (&input, (char *) dev_dir_name, vec_len (dev_dir_name)); if (!unformat (&input, "/sys/bus/pci/devices/%U", unformat_vlib_pci_addr, &addr)) err = clib_error_return (0, "unformat error `%v`", dev_dir_name); unformat_free (&input); if (err) return err; vec_add1 (*addrv, addr); return 0; } static int pci_addr_cmp (void *v1, void *v2) { vlib_pci_addr_t *a1 = v1; vlib_pci_addr_t *a2 = v2; if (a1->domain > a2->domain) return 1; if (a1->domain < a2->domain) return -1; if (a1->bus > a2->bus) return 1; if (a1->bus < a2->bus) return -1; if (a1->slot > a2->slot) return 1; if (a1->slot < a2->slot) return -1; if (a1->function > a2->function) return 1; if (a1->function < a2->function) return -1; return 0; } vlib_pci_addr_t * vlib_pci_get_all_dev_addrs () { vlib_pci_addr_t *addrs = 0; clib_error_t *err; err = foreach_directory_file ((char *) sysfs_pci_dev_path, scan_pci_addr, &addrs, /* scan_dirs */ 0); if (err) { vec_free (addrs); return 0; } vec_sort_with_function (addrs, pci_addr_cmp); return addrs; } clib_error_t * linux_pci_init (vlib_main_t * vm) { vlib_pci_main_t *pm = &pci_main; vlib_pci_addr_t *addr = 0, *addrs; clib_error_t *error; pm->vlib_main = vm; if ((error = vlib_call_init_function (vm, unix_input_init))) return error; ASSERT (sizeof (vlib_pci_addr_t) == sizeof (u32)); addrs = vlib_pci_get_all_dev_addrs (); /* *INDENT-OFF* */ vec_foreach (addr, addrs) { vlib_pci_device_info_t *d; if ((d = vlib_pci_get_device_info (addr, 0))) { init_device_from_registered (d); vlib_pci_free_device_info (d); } } /* *INDENT-ON* */ return error; } VLIB_INIT_FUNCTION (linux_pci_init); /* * fd.io coding-style-patch-verification: ON * * Local Variables: * eval: (c-set-style "gnu") * End: */