diff options
author | Damjan Marion <damarion@cisco.com> | 2020-09-11 22:16:53 +0200 |
---|---|---|
committer | Damjan Marion <damarion@cisco.com> | 2020-09-17 12:38:41 +0200 |
commit | 6bfd07670b991c30761ef74fb09f42181dbfd182 (patch) | |
tree | f82fa63e1b3ec6e4d31827efb553516936392288 /src/vppinfra/linux | |
parent | c63e2a4f980e09b4274558f0562cee285f9741b5 (diff) |
vppinfra: support main heap with different page sizes
Type: improvement
Change-Id: I381fc3dec8580208d0e24637d791af69011aa83b
Signed-off-by: Damjan Marion <damarion@cisco.com>
Diffstat (limited to 'src/vppinfra/linux')
-rw-r--r-- | src/vppinfra/linux/mem.c | 271 |
1 files changed, 247 insertions, 24 deletions
diff --git a/src/vppinfra/linux/mem.c b/src/vppinfra/linux/mem.c index 42efc007ceb..96fb0db5b73 100644 --- a/src/vppinfra/linux/mem.c +++ b/src/vppinfra/linux/mem.c @@ -50,11 +50,13 @@ #define MFD_HUGETLB 0x0004U #endif -uword -clib_mem_get_page_size (void) -{ - return getpagesize (); -} +#ifndef MAP_HUGE_SHIFT +#define MAP_HUGE_SHIFT 26 +#endif + +#ifndef MAP_FIXED_NOREPLACE +#define MAP_FIXED_NOREPLACE 0x100000 +#endif uword clib_mem_get_default_hugepage_size (void) @@ -166,10 +168,11 @@ clib_mem_get_fd_page_size (int fd) return st.st_blksize; } -int +clib_mem_page_sz_t clib_mem_get_fd_log2_page_size (int fd) { - return min_log2 (clib_mem_get_fd_page_size (fd)); + uword page_size = clib_mem_get_fd_page_size (fd); + return page_size ? min_log2 (page_size) : CLIB_MEM_PAGE_SZ_UNKNOWN; } void @@ -414,43 +417,263 @@ clib_mem_vm_ext_free (clib_mem_vm_alloc_t * a) uword clib_mem_vm_reserve (uword start, uword size, clib_mem_page_sz_t log2_page_sz) { - uword off, pagesize = 1ULL << log2_page_sz; - int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS; - u8 *p; - - if (start) - mmap_flags |= MAP_FIXED; + clib_mem_main_t *mm = &clib_mem_main; + uword pagesize = 1ULL << log2_page_sz; + uword sys_page_sz = 1ULL << mm->log2_page_sz; + uword n_bytes; + void *base = 0, *p; size = round_pow2 (size, pagesize); - p = uword_to_pointer (start, void *); - p = mmap (p, size + pagesize, PROT_NONE, mmap_flags, -1, 0); + /* in adition of requested reservation, we also rserve one system page + * (typically 4K) adjacent to the start off reservation */ + + if (start) + { + /* start address is provided, so we just need to make sure we are not + * replacing existing map */ + if (start & pow2_mask (log2_page_sz)) + return ~0; + + base = (void *) start - sys_page_sz; + base = mmap (base, size + sys_page_sz, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0); + return (base == MAP_FAILED) ? ~0 : start; + } + + /* to make sure that we get reservation aligned to page_size we need to + * request one additional page as mmap will return us address which is + * aligned only to system page size */ + base = mmap (0, size + pagesize, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (p == MAP_FAILED) + if (base == MAP_FAILED) return ~0; - off = round_pow2 ((uword) p, pagesize) - (uword) p; + /* return additional space at the end of allocation */ + p = base + size + pagesize; + n_bytes = (uword) p & pow2_mask (log2_page_sz); + if (n_bytes) + { + p -= n_bytes; + munmap (p, n_bytes); + } - /* trim start and end of reservation to be page aligned */ - if (off) + /* return additional space at the start of allocation */ + n_bytes = pagesize - sys_page_sz - n_bytes; + if (n_bytes) { - munmap (p, off); - p += off; + munmap (base, n_bytes); + base += n_bytes; } - munmap (p + size, pagesize - off); + return (uword) base + sys_page_sz; +} - return (uword) p; +clib_mem_vm_map_hdr_t * +clib_mem_vm_get_next_map_hdr (clib_mem_vm_map_hdr_t * hdr) +{ + clib_mem_main_t *mm = &clib_mem_main; + uword sys_page_sz = 1 << mm->log2_page_sz; + clib_mem_vm_map_hdr_t *next; + if (hdr == 0) + { + hdr = mm->first_map; + if (hdr) + mprotect (hdr, sys_page_sz, PROT_READ); + return hdr; + } + next = hdr->next; + mprotect (hdr, sys_page_sz, PROT_NONE); + if (next) + mprotect (next, sys_page_sz, PROT_READ); + return next; } +void * +clib_mem_vm_map_internal (void *base, clib_mem_page_sz_t log2_page_sz, + uword size, int fd, uword offset, char *name) +{ + clib_mem_main_t *mm = &clib_mem_main; + clib_mem_vm_map_hdr_t *hdr; + uword sys_page_sz = 1 << mm->log2_page_sz; + int mmap_flags = MAP_FIXED, is_huge = 0; + + if (fd != -1) + { + mmap_flags |= MAP_SHARED; + log2_page_sz = clib_mem_get_fd_log2_page_size (fd); + if (log2_page_sz > mm->log2_page_sz) + is_huge = 1; + } + else + { + mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS; + + if (log2_page_sz == mm->log2_page_sz) + log2_page_sz = CLIB_MEM_PAGE_SZ_DEFAULT; + + switch (log2_page_sz) + { + case CLIB_MEM_PAGE_SZ_UNKNOWN: + /* will fail later */ + break; + case CLIB_MEM_PAGE_SZ_DEFAULT: + log2_page_sz = mm->log2_page_sz; + break; + case CLIB_MEM_PAGE_SZ_DEFAULT_HUGE: + mmap_flags |= MAP_HUGETLB; + log2_page_sz = mm->log2_default_hugepage_sz; + is_huge = 1; + break; + default: + mmap_flags |= MAP_HUGETLB; + mmap_flags |= log2_page_sz << MAP_HUGE_SHIFT; + is_huge = 1; + } + } + + if (log2_page_sz == CLIB_MEM_PAGE_SZ_UNKNOWN) + return CLIB_MEM_VM_MAP_FAILED; + + size = round_pow2 (size, 1 << log2_page_sz); + + base = (void *) clib_mem_vm_reserve ((uword) base, size, log2_page_sz); + + if (base == (void *) ~0) + return CLIB_MEM_VM_MAP_FAILED; + + base = mmap (base, size, PROT_READ | PROT_WRITE, mmap_flags, fd, offset); + + if (base == MAP_FAILED) + return CLIB_MEM_VM_MAP_FAILED; + + if (is_huge && (mlock (base, size) != 0)) + { + munmap (base, size); + return CLIB_MEM_VM_MAP_FAILED; + } + + hdr = mmap (base - sys_page_sz, sys_page_sz, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + + if (hdr != base - sys_page_sz) + { + munmap (base, size); + return CLIB_MEM_VM_MAP_FAILED; + } + + if (mm->last_map) + { + mprotect (mm->last_map, sys_page_sz, PROT_READ | PROT_WRITE); + mm->last_map->next = hdr; + mprotect (mm->last_map, sys_page_sz, PROT_NONE); + } + else + mm->first_map = hdr; + + hdr->next = 0; + hdr->prev = mm->last_map; + mm->last_map = hdr; + + hdr->base_addr = (uword) base; + hdr->log2_page_sz = log2_page_sz; + hdr->num_pages = size >> log2_page_sz; + snprintf (hdr->name, CLIB_VM_MAP_HDR_NAME_MAX_LEN - 1, "%s", (char *) name); + hdr->name[CLIB_VM_MAP_HDR_NAME_MAX_LEN - 1] = 0; + mprotect (hdr, sys_page_sz, PROT_NONE); + + CLIB_MEM_UNPOISON (base, size); + return base; +} + +int +clib_mem_vm_unmap (void *base) +{ + clib_mem_main_t *mm = &clib_mem_main; + uword size, sys_page_sz = 1 << mm->log2_page_sz; + clib_mem_vm_map_hdr_t *hdr = base - sys_page_sz;; + + if (mprotect (hdr, sys_page_sz, PROT_READ | PROT_WRITE) != 0) + return -1; + + size = hdr->num_pages << hdr->log2_page_sz; + if (munmap ((void *) hdr->base_addr, size) != 0) + return -1; + + if (hdr->next) + { + mprotect (hdr->next, sys_page_sz, PROT_READ | PROT_WRITE); + hdr->next->prev = hdr->prev; + mprotect (hdr->next, sys_page_sz, PROT_NONE); + } + else + mm->last_map = hdr->prev; + + if (hdr->prev) + { + mprotect (hdr->prev, sys_page_sz, PROT_READ | PROT_WRITE); + hdr->prev->next = hdr->next; + mprotect (hdr->prev, sys_page_sz, PROT_NONE); + } + else + mm->first_map = hdr->next; + + if (munmap (hdr, sys_page_sz) != 0) + return -1; + + return 0; +} + +void +clib_mem_get_page_stats (void *start, clib_mem_page_sz_t log2_page_size, + uword n_pages, clib_mem_page_stats_t * stats) +{ + int i, *status = 0; + void **ptr = 0; + + log2_page_size = clib_mem_log2_page_size_validate (log2_page_size); + + vec_validate (status, n_pages - 1); + vec_validate (ptr, n_pages - 1); + + for (i = 0; i < n_pages; i++) + ptr[i] = start + (i << log2_page_size); + + clib_memset (stats, 0, sizeof (clib_mem_page_stats_t)); + + if (move_pages (0, n_pages, ptr, 0, status, 0) != 0) + { + stats->unknown = n_pages; + return; + } + + for (i = 0; i < n_pages; i++) + { + if (status[i] >= 0 && status[i] < CLIB_MAX_NUMAS) + { + stats->mapped++; + stats->per_numa[status[i]]++; + } + else if (status[i] == -EFAULT) + stats->not_mapped++; + else + stats->unknown++; + } +} + + u64 * -clib_mem_vm_get_paddr (void *mem, int log2_page_size, int n_pages) +clib_mem_vm_get_paddr (void *mem, clib_mem_page_sz_t log2_page_size, + int n_pages) { int pagesize = sysconf (_SC_PAGESIZE); int fd; int i; u64 *r = 0; + log2_page_size = clib_mem_log2_page_size_validate (log2_page_size); + if ((fd = open ((char *) "/proc/self/pagemap", O_RDONLY)) == -1) return 0; |