From 49d66f1f42cbc310e4fa0dc526b9fdb91d0ca220 Mon Sep 17 00:00:00 2001
From: Damjan Marion <damarion@cisco.com>
Date: Thu, 20 Jul 2017 18:10:35 +0200
Subject: vlib physmem rework

This patch adds support for multiple numa-aware physmem regions.

Change-Id: I5c69a6f4da33c8ee21bdb8604d52fd2886f2327e
Signed-off-by: Damjan Marion <damarion@cisco.com>
---
 src/vlib/unix/physmem.c | 572 +++++++++++++++++++++++-------------------------
 1 file changed, 274 insertions(+), 298 deletions(-)

diff --git a/src/vlib/unix/physmem.c b/src/vlib/unix/physmem.c
index 27a5bacffb0..d5d5d6c8ddd 100644
--- a/src/vlib/unix/physmem.c
+++ b/src/vlib/unix/physmem.c
@@ -37,24 +37,66 @@
  * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include <vlib/unix/physmem.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <numa.h>
+#include <numaif.h>
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vlib/unix/physmem.h>
+
+#ifndef __NR_memfd_create
+#if defined __x86_64__
+#define __NR_memfd_create 319
+#elif defined __arm__
+#define __NR_memfd_create 385
+#elif defined __aarch64__
+#define __NR_memfd_create 279
+#else
+#error "__NR_memfd_create unknown for this architecture"
+#endif
+#endif
+
+static inline int
+memfd_create (const char *name, unsigned int flags)
+{
+  return syscall (__NR_memfd_create, name, flags);
+}
+
+#ifndef F_LINUX_SPECIFIC_BASE
+#define F_LINUX_SPECIFIC_BASE 1024
+#endif
+#define MFD_ALLOW_SEALING 0x0002U
+#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
+#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
 
-static physmem_main_t physmem_main;
+#define F_SEAL_SEAL   0x0001	/* prevent further seals from being set */
+#define F_SEAL_SHRINK 0x0002	/* prevent file from shrinking */
+#define F_SEAL_GROW   0x0004	/* prevent file from growing */
+#define F_SEAL_WRITE  0x0008	/* prevent writes */
 
 static void *
-unix_physmem_alloc_aligned (vlib_physmem_main_t * vpm, uword n_bytes,
-			    uword alignment)
+unix_physmem_alloc_aligned (vlib_main_t * vm, vlib_physmem_region_index_t idx,
+			    uword n_bytes, uword alignment)
 {
-  physmem_main_t *pm = &physmem_main;
+  vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx);
   uword lo_offset, hi_offset;
   uword *to_free = 0;
 
+  if (pr->heap == 0)
+    return 0;
+
   /* IO memory is always at least cache aligned. */
   alignment = clib_max (alignment, CLIB_CACHE_LINE_BYTES);
 
   while (1)
     {
-      mheap_get_aligned (pm->heap, n_bytes,
+      mheap_get_aligned (pr->heap, n_bytes,
 			 /* align */ alignment,
 			 /* align offset */ 0, &lo_offset);
 
@@ -63,11 +105,14 @@ unix_physmem_alloc_aligned (vlib_physmem_main_t * vpm, uword n_bytes,
       if (lo_offset == ~0)
 	break;
 
+      if (pr->flags & VLIB_PHYSMEM_F_FAKE)
+	break;
+
       /* Make sure allocation does not span DMA physical chunk boundary. */
       hi_offset = lo_offset + n_bytes - 1;
 
-      if ((lo_offset >> vpm->log2_n_bytes_per_page) ==
-	  (hi_offset >> vpm->log2_n_bytes_per_page))
+      if ((lo_offset >> pr->log2_page_size) ==
+	  (hi_offset >> pr->log2_page_size))
 	break;
 
       /* Allocation would span chunk boundary, queue it to be freed as soon as
@@ -79,380 +124,311 @@ unix_physmem_alloc_aligned (vlib_physmem_main_t * vpm, uword n_bytes,
     {
       uword i;
       for (i = 0; i < vec_len (to_free); i++)
-	mheap_put (pm->heap, to_free[i]);
+	mheap_put (pr->heap, to_free[i]);
       vec_free (to_free);
     }
 
-  return lo_offset != ~0 ? pm->heap + lo_offset : 0;
+  return lo_offset != ~0 ? 
pr->heap + lo_offset : 0; } static void -unix_physmem_free (void *x) +unix_physmem_free (vlib_main_t * vm, vlib_physmem_region_index_t idx, void *x) { - physmem_main_t *pm = &physmem_main; - + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); /* Return object to region's heap. */ - mheap_put (pm->heap, x - pm->heap); + mheap_put (pr->heap, x - pr->heap); } -static void -htlb_shutdown (void) +static u64 +get_page_paddr (int fd, uword addr) { - physmem_main_t *pm = &physmem_main; - - if (!pm->shmid) - return; - shmctl (pm->shmid, IPC_RMID, 0); - pm->shmid = 0; -} + int pagesize = sysconf (_SC_PAGESIZE); + u64 seek, pagemap = 0; -/* try to use huge TLB pgs if possible */ -static int -htlb_init (vlib_main_t * vm) -{ - vlib_physmem_main_t *vpm = &vm->physmem_main; - physmem_main_t *pm = &physmem_main; - u64 hugepagesize, pagesize; - u64 pfn, seek_loc; - u64 cur, physaddr, ptbits; - int fd, i; - - pm->shmid = shmget (11 /* key, my amp goes to 11 */ , pm->mem_size, - IPC_CREAT | SHM_HUGETLB | SHM_R | SHM_W); - if (pm->shmid < 0) + seek = ((u64) addr / pagesize) * sizeof (u64); + if (lseek (fd, seek, SEEK_SET) != seek) { - clib_unix_warning ("shmget"); + clib_unix_warning ("lseek to 0x%llx", seek); return 0; } - - pm->mem = shmat (pm->shmid, NULL, 0 /* flags */ ); - if (pm->mem == 0) + if (read (fd, &pagemap, sizeof (pagemap)) != (sizeof (pagemap))) { - shmctl (pm->shmid, IPC_RMID, 0); + clib_unix_warning ("read ptbits"); return 0; } + if ((pagemap & (1ULL << 63)) == 0) + return 0; - memset (pm->mem, 0, pm->mem_size); + pagemap &= pow2_mask (55); - /* $$$ get page size info from /proc/meminfo */ - hugepagesize = 2 << 20; - pagesize = 4 << 10; - vpm->log2_n_bytes_per_page = min_log2 (hugepagesize); - vec_resize (vpm->page_table, pm->mem_size / hugepagesize); + return pagemap * pagesize; +} - vpm->page_mask = pow2_mask (vpm->log2_n_bytes_per_page); - vpm->virtual.start = pointer_to_uword (pm->mem); - vpm->virtual.size = pm->mem_size; - vpm->virtual.end = vpm->virtual.start + vpm->virtual.size; +static clib_error_t * +unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, + u8 numa_node, u32 flags, + vlib_physmem_region_index_t * idx) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + vlib_physmem_region_t *pr; + clib_error_t *error = 0; + int pagemap_fd = -1; + u8 *mount_dir = 0; + u8 *filename = 0; + struct stat st; + int old_mpol; + int mmap_flags; + struct bitmask *old_mask = numa_allocate_nodemask (); - fd = open ("/proc/self/pagemap", O_RDONLY); + if (geteuid () != 0 && (flags & VLIB_PHYSMEM_F_FAKE) == 0) + return clib_error_return (0, "not allowed"); - if (fd < 0) + pool_get (vpm->regions, pr); + + if ((pr - vpm->regions) >= 256) { - (void) shmdt (pm->mem); - return 0; + error = clib_error_return (0, "maximum number of regions reached"); + goto error; } - pm->heap = mheap_alloc_with_flags (pm->mem, pm->mem_size, - /* Don't want mheap mmap/munmap with IO memory. 
*/ - MHEAP_FLAG_DISABLE_VM | - MHEAP_FLAG_THREAD_SAFE); + pr->index = pr - vpm->regions; + pr->fd = -1; + pr->flags = flags; - cur = pointer_to_uword (pm->mem); - i = 0; + if (get_mempolicy (&old_mpol, old_mask->maskp, old_mask->size + 1, NULL, 0) + == -1) + { + error = clib_error_return_unix (0, "get_mempolicy"); + goto error; + } - while (cur < pointer_to_uword (pm->mem) + pm->mem_size) + if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) { - pfn = (u64) cur / pagesize; - seek_loc = pfn * sizeof (u64); - if (lseek (fd, seek_loc, SEEK_SET) != seek_loc) - { - clib_unix_warning ("lseek to 0x%llx", seek_loc); - shmctl (pm->shmid, IPC_RMID, 0); - close (fd); - return 0; - } - if (read (fd, &ptbits, sizeof (ptbits)) != (sizeof (ptbits))) + if ((pagemap_fd = open ((char *) "/proc/self/pagemap", O_RDONLY)) == -1) { - clib_unix_warning ("read ptbits"); - shmctl (pm->shmid, IPC_RMID, 0); - close (fd); - return 0; + error = clib_error_return_unix (0, "open '/proc/self/pagemap'"); + goto error; } - /* bits 0-54 are the physical page number */ - physaddr = (ptbits & 0x7fffffffffffffULL) * pagesize; - if (CLIB_DEBUG > 1) - fformat (stderr, "pm: virtual 0x%llx physical 0x%llx\n", - cur, physaddr); - vpm->page_table[i++] = physaddr; + mount_dir = format (0, "%s/physmem_region%d%c", + vlib_unix_get_runtime_dir (), pr->index, 0); + filename = format (0, "%s/mem%c", mount_dir, 0); - cur += hugepagesize; - } - close (fd); - atexit (htlb_shutdown); - return 1; -} - -int vlib_app_physmem_init (vlib_main_t * vm, - physmem_main_t * pm, int) __attribute__ ((weak)); -int -vlib_app_physmem_init (vlib_main_t * vm, physmem_main_t * pm, int x) -{ - return 0; -} - -clib_error_t * -unix_physmem_init (vlib_main_t * vm, int physical_memory_required) -{ - vlib_physmem_main_t *vpm = &vm->physmem_main; - physmem_main_t *pm = &physmem_main; - clib_error_t *error = 0; - - /* Avoid multiple calls. */ - if (vm->os_physmem_alloc_aligned) - return error; + unlink ((char *) mount_dir); - vm->os_physmem_alloc_aligned = unix_physmem_alloc_aligned; - vm->os_physmem_free = unix_physmem_free; - pm->mem = MAP_FAILED; + error = vlib_unix_recursive_mkdir ((char *) mount_dir); + if (error) + goto error; - if (pm->mem_size == 0) - pm->mem_size = 16 << 20; + if (mount ("none", (char *) mount_dir, "hugetlbfs", 0, NULL)) + { + error = clib_error_return_unix (0, "mount hugetlb directory '%s'", + mount_dir); + goto error; + } - /* OK, Mr. App, you tell us */ - if (vlib_app_physmem_init (vm, pm, physical_memory_required)) - return 0; + if ((pr->fd = open ((char *) filename, O_CREAT | O_RDWR, 0755)) == -1) + { + error = clib_error_return_unix (0, "open"); + goto error; + } - if (!pm->no_hugepages && htlb_init (vm)) + mmap_flags = MAP_SHARED | MAP_HUGETLB | MAP_LOCKED; + } + else { - fformat (stderr, "%s: use huge pages\n", __FUNCTION__); - return 0; + if ((pr->fd = memfd_create (name, MFD_ALLOW_SEALING)) == -1) + return clib_error_return_unix (0, "memfd_create"); + + if ((fcntl (pr->fd, F_ADD_SEALS, F_SEAL_SHRINK)) == -1) + { + error = + clib_error_return_unix (0, "fcntl (F_ADD_SEALS, F_SEAL_SHRINK)"); + goto error; + } + mmap_flags = MAP_SHARED; } - pm->mem = - mmap (0, pm->mem_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (pm->mem == MAP_FAILED) + if (fstat (pr->fd, &st)) { - error = clib_error_return_unix (0, "mmap"); - goto done; + error = clib_error_return_unix (0, "fstat"); + goto error; } - pm->heap = mheap_alloc (pm->mem, pm->mem_size); - - /* Identity map with a single page. 
*/
-  vpm->log2_n_bytes_per_page = min_log2 (pm->mem_size);
-  vec_add1 (vpm->page_table, pointer_to_uword (pm->mem));
-
-  vpm->page_mask = pow2_mask (vpm->log2_n_bytes_per_page);
-  vpm->virtual.start = pointer_to_uword (pm->mem);
-  vpm->virtual.size = pm->mem_size;
-  vpm->virtual.end = vpm->virtual.start + vpm->virtual.size;
-  vpm->is_fake = 1;
+  pr->log2_page_size = min_log2 (st.st_blksize);
+  pr->n_pages = ((size - 1) >> pr->log2_page_size) + 1;
+  size = pr->n_pages * (1 << pr->log2_page_size);
 
-  fformat (stderr, "%s: use fake dma pages\n", __FUNCTION__);
+  if ((ftruncate (pr->fd, size)) == -1)
+    {
+      error = clib_error_return_unix (0, "ftruncate length: %d", size);
+      goto error;
+    }
 
-done:
-  if (error)
+  if ((flags & VLIB_PHYSMEM_F_FAKE) == 0)
     {
-      if (pm->mem != MAP_FAILED)
-	munmap (pm->mem, pm->mem_size);
+      error = vlib_sysfs_prealloc_hugepages (numa_node,
+					     1 << (pr->log2_page_size - 10),
+					     pr->n_pages);
+      if (error)
+	goto error;
     }
-  return error;
-}
 
-static clib_error_t *
-show_physmem (vlib_main_t * vm,
-	      unformat_input_t * input, vlib_cli_command_t * cmd)
-{
-  physmem_main_t *pm = &physmem_main;
+  numa_set_preferred (numa_node);
 
-  if (pm->heap)
-    vlib_cli_output (vm, "%U", format_mheap, pm->heap, /* verbose */ 1);
-  else
-    vlib_cli_output (vm, "No physmem allocated.");
-  return 0;
-}
+  pr->mem = mmap (0, size, (PROT_READ | PROT_WRITE), mmap_flags, pr->fd, 0);
 
-/* *INDENT-OFF* */
-VLIB_CLI_COMMAND (show_physmem_command, static) = {
-  .path = "show physmem",
-  .short_help = "Show physical memory allocation",
-  .function = show_physmem,
-};
-/* *INDENT-ON* */
+  if (pr->mem == MAP_FAILED)
+    {
+      pr->mem = 0;
+      error = clib_error_return_unix (0, "mmap");
+      goto error;
+    }
 
-static clib_error_t *
-show_affinity (vlib_main_t * vm,
-	       unformat_input_t * input, vlib_cli_command_t * cmd)
-{
-  cpu_set_t set;
-  cpu_set_t *setp = &set;
-  int i, rv;
-  u8 *s = 0;
-  int first_set_bit_in_run = -1;
-  int last_set_bit_in_run = -1;
-  int output_done = 0;
-
-  rv = sched_getaffinity (0 /* pid, 0 = this proc */ ,
-			  sizeof (*setp), setp);
-  if (rv < 0)
+  if (set_mempolicy (old_mpol, old_mask->maskp, old_mask->size + 1) == -1)
     {
-      vlib_cli_output (vm, "Couldn't get affinity mask: %s\n",
-		       strerror (errno));
-      return 0;
+      error = clib_error_return_unix (0, "set_mempolicy");
+      goto error;
     }
 
-  for (i = 0; i < 64; i++)
+  pr->size = pr->n_pages << pr->log2_page_size;
+  pr->page_mask = (1 << pr->log2_page_size) - 1;
+  pr->numa_node = numa_node;
+  pr->name = format (0, "%s", name);
+
+  if ((flags & VLIB_PHYSMEM_F_FAKE) == 0)
     {
-      if (CPU_ISSET (i, setp))
-	{
-	  if (first_set_bit_in_run == -1)
-	    {
-	      first_set_bit_in_run = i;
-	      last_set_bit_in_run = i;
-	      if (output_done)
-		s = format (s, ",");
-	      s = format (s, "%d-", i);
-	      output_done = 1;
-	    }
-	  else
-	    {
-	      if (i == (last_set_bit_in_run + 1))
-		last_set_bit_in_run = i;
-	    }
-	}
-      else
+      int i;
+      for (i = 0; i < pr->n_pages; i++)
 	{
-	  if (first_set_bit_in_run != -1)
+	  void *ptr = pr->mem + (i << pr->log2_page_size);
+	  int node;
+	  move_pages (0, 1, &ptr, 0, &node, 0);
+	  if (numa_node != node)
 	    {
-	      if (first_set_bit_in_run == (i - 1))
-		{
-		  _vec_len (s) -= 2 + ((first_set_bit_in_run / 10));
-		}
-	      s = format (s, "%d", last_set_bit_in_run);
-	      first_set_bit_in_run = -1;
-	      last_set_bit_in_run = -1;
+	      clib_warning
+		("physmem page for region \'%s\' allocated on the wrong"
+		 " numa node (requested %u actual %u)", pr->name,
+		 pr->numa_node, node);
+	      break;
 	    }
 	}
     }
-  if (first_set_bit_in_run != -1)
-    s = format (s, "%d", first_set_bit_in_run);
-
-  vlib_cli_output (vm, "Process runs 
on: %v", s); - return 0; -} - -/* *INDENT-OFF* */ -VLIB_CLI_COMMAND (show_affinity_command, static) = { - .path = "show affinity", - .short_help = "Show process cpu affinity", - .function = show_affinity, -}; -/* *INDENT-ON* */ + if (flags & VLIB_PHYSMEM_F_INIT_MHEAP) + { + pr->heap = mheap_alloc_with_flags (pr->mem, pr->size, + /* Don't want mheap mmap/munmap with IO memory. */ + MHEAP_FLAG_DISABLE_VM | + MHEAP_FLAG_THREAD_SAFE); + fformat (stdout, "%U", format_mheap, pr->heap, /* verbose */ 1); + } -static clib_error_t * -set_affinity (vlib_main_t * vm, - unformat_input_t * input, vlib_cli_command_t * cmd) -{ - cpu_set_t set; - cpu_set_t *setp = &set; - int i, rv; - int another_round; - u32 first, last; + if (flags & VLIB_PHYSMEM_F_HAVE_BUFFERS) + { + vlib_buffer_add_mem_range (vm, pointer_to_uword (pr->mem), pr->size); + } - memset (setp, 0, sizeof (*setp)); + *idx = pr->index; - do + if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) { - another_round = 0; - if (unformat (input, "%d-%d,", &first, &last)) + int i; + for (i = 0; i < pr->n_pages; i++) { - if (first > 64 || last > 64) - { - barf1: - vlib_cli_output (vm, "range %d-%d invalid", first, last); - return 0; - } - - for (i = first; i <= last; i++) - CPU_SET (i, setp); - another_round = 1; + uword vaddr = + pointer_to_uword (pr->mem) + (((u64) i) << pr->log2_page_size); + u64 page_paddr = get_page_paddr (pagemap_fd, vaddr); + vec_add1 (pr->page_table, page_paddr); } - else if (unformat (input, "%d-%d", &first, &last)) - { - if (first > 64 || last > 64) - goto barf1; + } - for (i = first; i <= last; i++) - CPU_SET (i, setp); - } - else if (unformat (input, "%d,", &first)) - { - if (first > 64) - { - barf2: - vlib_cli_output (vm, "cpu %d invalid", first); - return 0; - } - CPU_SET (first, setp); - another_round = 1; - } - else if (unformat (input, "%d", &first)) - { - if (first > 64) - goto barf2; + goto done; - CPU_SET (first, setp); - } - } - while (another_round); +error: + if (pr->fd > -1) + close (pr->fd); - rv = sched_setaffinity (0 /* pid, 0 = this proc */ , - sizeof (*setp), setp); + if (pr->mem) + munmap (pr->mem, size); - if (rv < 0) + memset (pr, 0, sizeof (*pr)); + pool_put (vpm->regions, pr); + +done: + if (mount_dir) { - vlib_cli_output (vm, "Couldn't get affinity mask: %s\n", - strerror (errno)); - return 0; + umount2 ((char *) mount_dir, MNT_DETACH); + rmdir ((char *) mount_dir); + vec_free (mount_dir); } - return show_affinity (vm, input, cmd); + numa_free_cpumask (old_mask); + vec_free (filename); + if (pagemap_fd > -1) + close (pagemap_fd); + return error; } -/* *INDENT-OFF* */ -VLIB_CLI_COMMAND (set_affinity_command, static) = { - .path = "set affinity", - .short_help = "Set process cpu affinity", - .function = set_affinity, -}; -/* *INDENT-ON* */ +static void +unix_physmem_region_free (vlib_main_t * vm, vlib_physmem_region_index_t idx) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); + + if (pr->fd > 0) + close (pr->fd); + munmap (pr->mem, pr->size); + vec_free (pr->name); + pool_put (vpm->regions, pr); +} + +clib_error_t * +unix_physmem_init (vlib_main_t * vm) +{ + clib_error_t *error = 0; + + /* Avoid multiple calls. 
*/
+  if (vm->os_physmem_alloc_aligned)
+    return error;
+
+  vm->os_physmem_alloc_aligned = unix_physmem_alloc_aligned;
+  vm->os_physmem_free = unix_physmem_free;
+  vm->os_physmem_region_alloc = unix_physmem_region_alloc;
+  vm->os_physmem_region_free = unix_physmem_region_free;
+
+  return error;
+}
 
 static clib_error_t *
-vlib_physmem_configure (vlib_main_t * vm, unformat_input_t * input)
+show_physmem (vlib_main_t * vm,
+	      unformat_input_t * input, vlib_cli_command_t * cmd)
 {
-  physmem_main_t *pm = &physmem_main;
-  u32 size_in_mb;
+  vlib_physmem_main_t *vpm = &vm->physmem_main;
+  vlib_physmem_region_t *pr;
 
-  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+  /* *INDENT-OFF* */
+  pool_foreach (pr, vpm->regions, (
     {
-      if (unformat (input, "no-huge") || unformat (input, "no-huge-pages"))
-	pm->no_hugepages = 1;
-
-      else if (unformat (input, "size-in-mb %d", &size_in_mb) ||
-	       unformat (input, "size %d", &size_in_mb))
-	pm->mem_size = size_in_mb << 20;
+      vlib_cli_output (vm, "index %u name '%s' page-size %uKB num-pages %d "
+		       "numa-node %u fd %d\n",
+		       pr->index, pr->name, (1 << (pr->log2_page_size - 10)),
+		       pr->n_pages, pr->numa_node, pr->fd);
+      if (pr->heap)
+	vlib_cli_output (vm, "  %U", format_mheap, pr->heap, /* verbose */ 1);
       else
-	return unformat_parse_error (input);
-    }
-
-  unformat_free (input);
+	vlib_cli_output (vm, "  no heap\n");
+    }));
+  /* *INDENT-ON* */
   return 0;
 }
 
-VLIB_EARLY_CONFIG_FUNCTION (vlib_physmem_configure, "physmem");
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (show_physmem_command, static) = {
+  .path = "show physmem",
+  .short_help = "Show physical memory allocation",
+  .function = show_physmem,
+};
+/* *INDENT-ON* */
 
 /*
  * fd.io coding-style-patch-verification: ON
  *
  * Local Variables:
  * eval: (c-set-style "gnu")
  * End:
  */
--
cgit 1.2.3-korg
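
For readers unfamiliar with the fake-region path above: when VLIB_PHYSMEM_F_FAKE is set, the region is backed by an anonymous memfd that is sealed against shrinking before being mapped, so the mapping cannot be truncated underneath the allocator. Below is a minimal standalone sketch of that technique, not part of the patch; the demo name and the 16MB size are illustrative, and it assumes glibc 2.27+ (which ships a memfd_create() wrapper; older systems need the syscall() shim the patch defines).

/* memfd_demo.c - sketch of the memfd_create() + F_SEAL_SHRINK technique.
 * Build: cc memfd_demo.c -o memfd_demo */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

int
main (void)
{
  size_t size = 16 << 20;	/* 16MB, illustrative region size */
  int fd = memfd_create ("physmem_demo", MFD_ALLOW_SEALING);
  if (fd == -1)
    return 1;

  /* size the file, then seal it so it cannot shrink while mapped */
  if (ftruncate (fd, size) == -1 ||
      fcntl (fd, F_ADD_SEALS, F_SEAL_SHRINK) == -1)
    return 1;

  void *mem = mmap (0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  if (mem == MAP_FAILED)
    return 1;

  memset (mem, 0, size);	/* fault the pages in */
  printf ("mapped %zu bytes at %p, seals = 0x%x\n",
	  size, mem, fcntl (fd, F_GET_SEALS));
  munmap (mem, size);
  close (fd);
  return 0;
}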
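Similarly, get_page_paddr() in the patch resolves the physical address behind a virtual page by reading /proc/self/pagemap, which holds one 64-bit entry per virtual page: bit 63 marks the page present, and bits 0-54 carry the physical frame number, which is why the patch masks with pow2_mask (55). Here is a self-contained sketch of the same lookup; the file and function names are hypothetical, and it must run as root, since unprivileged reads return zeroed frame numbers on kernels since 4.0.

/* pagemap_demo.c - sketch of the /proc/self/pagemap translation technique.
 * Build: cc pagemap_demo.c -o pagemap_demo; run as root. */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>

static uint64_t
virt_to_phys (int fd, void *addr)
{
  long pagesize = sysconf (_SC_PAGESIZE);
  uint64_t entry;
  /* one 64-bit pagemap entry per virtual page */
  off_t off = ((uintptr_t) addr / pagesize) * sizeof (uint64_t);

  if (pread (fd, &entry, sizeof (entry), off) != sizeof (entry))
    return 0;
  if ((entry & (1ULL << 63)) == 0)	/* bit 63: page present */
    return 0;
  /* bits 0-54 hold the physical frame number */
  return (entry & ((1ULL << 55) - 1)) * pagesize
    + ((uintptr_t) addr % pagesize);
}

int
main (void)
{
  int fd = open ("/proc/self/pagemap", O_RDONLY);
  if (fd == -1)
    return 1;
  char *p = malloc (4096);
  *(volatile char *) p = 1;	/* touch so the page is actually mapped */
  printf ("virt %p -> phys 0x%llx\n", (void *) p,
	  (unsigned long long) virt_to_phys (fd, p));
  close (fd);
  return 0;
}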