/*
 *------------------------------------------------------------------
 * svm.c - shared VM allocation, mmap(...MAP_FIXED...) library
 *
 * Copyright (c) 2009 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *------------------------------------------------------------------
 */

/*
 * NOTE(review): the <...> targets of the following 22 includes were lost
 * in extraction (angle-bracketed names stripped as markup).  Restore from
 * upstream svm.c — presumably stdio.h, stdlib.h, sys/types.h, sys/mman.h,
 * sys/stat.h, netinet/in.h, signal.h, pthread.h, unistd.h, time.h,
 * fcntl.h, string.h, and the vppinfra clib/vec/hash/bitmap/fifo/time/
 * mheap/heap/pool/format headers — TODO confirm against upstream.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "svm.h"

/* Root region: the region that tracks all other (sub)regions. */
static svm_region_t *root_rp;
/* Number of times this process has mapped the root region. */
static int root_rp_refcount;

/* Maximum number of region mutexes this process may hold at once. */
#define MAXLOCK 2
/* Score-board of currently-held region mutexes (see region_lock). */
static pthread_mutex_t *mutexes_held[MAXLOCK];
static int nheld;

/* Accessor for the file-scope root region pointer. */
svm_region_t *
svm_get_root_rp (void)
{
  return root_rp;
}

#define MUTEX_DEBUG

/*
 * Return the base virtual address for the global SVM region.
 * On AArch64 the usable VA width varies (36..48 bits), so probe it;
 * everywhere else, return the fixed default 0x30000000.
 */
u64
svm_get_global_region_base_va ()
{
#if __aarch64__
  /* On AArch64 VA space can have different size, from 36 to 48 bits.
     Here we are trying to detect VA bits by parsing /proc/self/maps
     address ranges */
  int fd;
  unformat_input_t input;
  u64 start, end = 0;
  u8 bits = 0;

  if ((fd = open ("/proc/self/maps", 0)) < 0)
    clib_unix_error ("open '/proc/self/maps'");

  unformat_init_clib_file (&input, fd);
  while (unformat_check_input (&input) != UNFORMAT_END_OF_INPUT)
    {
      /* Each maps line starts "start-end ..."; keep the highest end
         address seen (minus one, so it is a valid address). */
      if (unformat (&input, "%llx-%llx", &start, &end))
	end--;
      unformat_skip_line (&input);
    }
  unformat_free (&input);
  close (fd);

  /* Number of significant VA bits = 64 - leading zeros of the
     highest mapped address. */
  count_leading_zeros (bits, end);
  bits = 64 - bits;
  if (bits >= 36 && bits <= 48)
    /* Place the region at 1/4 of the VA space, backed off by two
       global-region sizes. */
    return ((1ul << bits) / 4) - (2 * SVM_GLOBAL_REGION_SIZE);
  else
    clib_unix_error ("unexpected va bits '%u'", bits);
#endif

  /* default value */
  return 0x30000000;
}

/*
 * Take a region's process-shared mutex and record it in the held-mutex
 * score-board so exit paths can detect/unwind held locks.  'tag'
 * identifies the call site for post-mortem debugging.
 */
static void
region_lock (svm_region_t * rp, int tag)
{
  pthread_mutex_lock (&rp->mutex);
#ifdef MUTEX_DEBUG
  rp->mutex_owner_pid = getpid ();
  rp->mutex_owner_tag = tag;
#endif
  ASSERT (nheld < MAXLOCK);
  /*
   * Keep score of held mutexes so we can try to exit
   * cleanly if the world comes to an end at the worst possible
   * moment
   */
  mutexes_held[nheld++] = &rp->mutex;
}

/*
 * Release a region's mutex and remove it from the score-board.
 * Search newest-to-oldest; the entry must be present (ASSERT otherwise).
 */
static void
region_unlock (svm_region_t * rp)
{
  int i, j;
#ifdef MUTEX_DEBUG
  rp->mutex_owner_pid = 0;
  rp->mutex_owner_tag = 0;
#endif
  for (i = nheld - 1; i >= 0; i--)
    {
      if (mutexes_held[i] == &rp->mutex)
	{
	  /* Compact the score-board over the released slot.
	     NOTE(review): the shift bound is MAXLOCK - 1, not
	     nheld - 1, so stale slots above nheld may be copied;
	     harmless since nheld is decremented, but worth a look. */
	  for (j = i; j < MAXLOCK - 1; j++)
	    mutexes_held[j] = mutexes_held[j + 1];
	  nheld--;
	  goto found;
	}
    }
  ASSERT (0);

found:
  /* Make all region writes visible before dropping the lock. */
  CLIB_MEMORY_BARRIER ();
  pthread_mutex_unlock (&rp->mutex);
}

/* format() helper: render SVM_FLAGS_* bits as human-readable words. */
static u8 *
format_svm_flags (u8 * s, va_list * args)
{
  uword f = va_arg (*args, uword);

  if (f & SVM_FLAGS_MHEAP)
    s = format (s, "MHEAP ");
  if (f & SVM_FLAGS_FILE)
    s = format (s, "FILE ");
  if (f & SVM_FLAGS_NODATA)
    s = format (s, "NODATA ");
  if (f & SVM_FLAGS_NEED_DATA_INIT)
    s = format (s, "INIT ");

  return (s);
}

/* format() helper: render a byte count in mb / kb / bytes. */
static u8 *
format_svm_size (u8 * s, va_list * args)
{
  uword size = va_arg (*args, uword);

  if (size >= (1 << 20))
    {
      s = format (s, "(%d mb)", size >> 20);
    }
  else if (size >= (1 << 10))
    {
      s = format (s, "(%d kb)", size >> 10);
    }
  else
    {
      s = format (s, "(%d bytes)", size);
    }
  return (s);
}

/*
 * format() helper: one-region summary.  Varargs: (svm_region_t *rp,
 * int verbose).  With verbose != 0, also dumps flags, heap pointers,
 * the in-use page bitmap as address ranges, and mheap statistics.
 */
u8 *
format_svm_region (u8 * s, va_list * args)
{
  svm_region_t *rp = va_arg (*args, svm_region_t *);
  int verbose = va_arg (*args, int);
  int i;
  uword lo, hi;

  s = format (s, "%s: base va 0x%x size 0x%x %U\n",
	      rp->region_name, rp->virtual_base,
	      rp->virtual_size, format_svm_size, rp->virtual_size);
  s = format (s, " user_ctx 0x%x, bitmap_size %d\n",
	      rp->user_ctx, rp->bitmap_size);

  if (verbose)
    {
      s = format (s, " flags: 0x%x %U\n", rp->flags,
		  format_svm_flags, rp->flags);
      s = format (s,
		  " region_heap 0x%x data_base 0x%x data_heap 0x%x\n",
		  rp->region_heap, rp->data_base, rp->data_heap);
    }

  s = format (s, " %d clients, pids: ", vec_len (rp->client_pids));

  for (i = 0; i < vec_len (rp->client_pids); i++)
    s = format (s, "%d ", rp->client_pids[i]);

  s = format (s, "\n");

  if (verbose)
    {
      /* Walk the page bitmap and coalesce consecutive set bits into
         [lo, hi] address ranges; ~0 in lo means "no open range". */
      lo = hi = ~0;

      s = format (s, " VM in use: ");

      for (i = 0; i < rp->bitmap_size; i++)
	{
	  if (clib_bitmap_get_no_check (rp->bitmap, i) != 0)
	    {
	      if (lo == ~0)
		{
		  hi = lo = rp->virtual_base + i * MMAP_PAGESIZE;
		}
	      else
		{
		  hi = rp->virtual_base + i * MMAP_PAGESIZE;
		}
	    }
	  else
	    {
	      if (lo != ~0)
		{
		  hi = rp->virtual_base + i * MMAP_PAGESIZE - 1;
		  s = format (s, " 0x%x - 0x%x (%dk)\n", lo, hi,
			      (hi - lo) >> 10);
		  lo = hi = ~0;
		}
	    }
	}
      s = format (s, " rgn heap stats: %U", format_mheap,
		  rp->region_heap, 0);
      if ((rp->flags & SVM_FLAGS_MHEAP) && rp->data_heap)
	{
	  s = format (s, "\n data heap stats: %U", format_mheap,
		      rp->data_heap, 1);
	}
      s = format (s, "\n");
    }

  return (s);
}

/*
 * rnd_pagesize
 * Round to a pagesize multiple, presumably 4k works
 */
static u64
rnd_pagesize (u64 size)
{
  u64 rv;

  rv = (size + (MMAP_PAGESIZE - 1)) & ~(MMAP_PAGESIZE - 1);
  return (rv);
}

/*
 * svm_data_region_setup
 *
 * Create the data portion of a region.  If SVM_FLAGS_FILE is set,
 * back it with a file: create/size the file if it is a new regular
 * file, then mmap it MAP_SHARED|MAP_FIXED at rp->data_base.  If
 * SVM_FLAGS_MHEAP is set, carve an mheap out of the data area.
 * Returns 0 on success, a negative code on failure (fd closed on all
 * error paths).
 */
static int
svm_data_region_create (svm_map_region_args_t * a, svm_region_t * rp)
{
  int fd;
  u8 junk = 0;
  uword map_size;

  /* Data area = region size minus the header page and the private heap. */
  map_size = rp->virtual_size - (MMAP_PAGESIZE +
				 (a->pvt_heap_size ?
				  a->pvt_heap_size : SVM_PVT_MHEAP_SIZE));

  if (a->flags & SVM_FLAGS_FILE)
    {
      struct stat statb;

      fd = open (a->backing_file, O_RDWR | O_CREAT, 0777);

      if (fd < 0)
	{
	  clib_unix_warning ("open");
	  return -1;
	}

      if (fstat (fd, &statb) < 0)
	{
	  clib_unix_warning ("fstat");
	  close (fd);
	  return -2;
	}

      if (statb.st_mode & S_IFREG)
	{
	  if (statb.st_size == 0)
	    {
	      /* Brand-new file: extend it to map_size by seeking and
	         writing one byte (classic sparse-file trick). */
	      if (lseek (fd, map_size, SEEK_SET) == (off_t) - 1)
		{
		  clib_unix_warning ("seek region size");
		  close (fd);
		  return -3;
		}
	      if (write (fd, &junk, 1) != 1)
		{
		  clib_unix_warning ("set region size");
		  close (fd);
		  return -3;
		}
	    }
	  else
	    {
	      /* Existing file: map its (page-rounded) current size. */
	      map_size = rnd_pagesize (statb.st_size);
	    }
	}
      else
	{
	  /* Not a regular file (e.g. device): caller supplies size. */
	  map_size = a->backing_mmap_size;
	}

      ASSERT (map_size <= rp->virtual_size -
	      (MMAP_PAGESIZE + SVM_PVT_MHEAP_SIZE));

      if (mmap (rp->data_base, map_size, PROT_READ | PROT_WRITE,
		MAP_SHARED | MAP_FIXED, fd, 0) == MAP_FAILED)
	{
	  clib_unix_warning ("mmap");
	  close (fd);
	  return -3;
	}
      close (fd);
      /* NOTE(review): "%s\0" — clib format stops at NUL, so the \0 is
         ineffective; the vector is not NUL-terminated here.  Verify
         consumers of rp->backing_file. */
      rp->backing_file = (char *) format (0, "%s\0", a->backing_file);
      rp->flags |= SVM_FLAGS_FILE;
    }

  if (a->flags & SVM_FLAGS_MHEAP)
    {
      rp->data_heap =
	mheap_alloc_with_flags ((void *) (rp->data_base), map_size,
				MHEAP_FLAG_DISABLE_VM);
      rp->flags |= SVM_FLAGS_MHEAP;
    }
  return 0;
}

/*
 * Map (attach to) an existing region's data portion.  Same shape as
 * svm_data_region_create but opens the backing file without O_CREAT
 * and does not set flags or allocate the data heap.  Returns 0 on
 * success, negative on failure.
 */
static int
svm_data_region_map (svm_map_region_args_t * a, svm_region_t * rp)
{
  int fd;
  u8 junk = 0;
  uword map_size;
  struct stat statb;

  map_size = rp->virtual_size - (MMAP_PAGESIZE +
				 (a->pvt_heap_size ?
				  a->pvt_heap_size : SVM_PVT_MHEAP_SIZE));

  if (a->flags & SVM_FLAGS_FILE)
    {
      fd = open (a->backing_file, O_RDWR, 0777);

      if (fd < 0)
	{
	  clib_unix_warning ("open");
	  return -1;
	}

      if (fstat (fd, &statb) < 0)
	{
	  clib_unix_warning ("fstat");
	  close (fd);
	  return -2;
	}

      if (statb.st_mode & S_IFREG)
	{
	  if (statb.st_size == 0)
	    {
	      /* File exists but is empty: size it now. */
	      if (lseek (fd, map_size, SEEK_SET) == (off_t) - 1)
		{
		  clib_unix_warning ("seek region size");
		  close (fd);
		  return -3;
		}
	      if (write (fd, &junk, 1) != 1)
		{
		  clib_unix_warning ("set region size");
		  close (fd);
		  return -3;
		}
	    }
	  else
	    {
	      map_size = rnd_pagesize (statb.st_size);
	    }
	}
      else
	{
	  map_size = a->backing_mmap_size;
	}

      ASSERT (map_size <= rp->virtual_size -
	      (MMAP_PAGESIZE +
	       (a->pvt_heap_size ? a->pvt_heap_size : SVM_PVT_MHEAP_SIZE)));

      if (mmap (rp->data_base, map_size, PROT_READ | PROT_WRITE,
		MAP_SHARED | MAP_FIXED, fd, 0) == MAP_FAILED)
	{
	  clib_unix_warning ("mmap");
	  close (fd);
	  return -3;
	}
      close (fd);
    }
  return 0;
}

/*
 * Build the shm-open name for a region from (root_path, name),
 * tolerating present or absent leading slashes in both.  Also walks
 * root_path building a "-"-prefixed arg vector per path component
 * (presumably once used to mkdir the hierarchy under /dev/shm — the
 * loop now only builds and frees the vector; TODO confirm intent).
 * Returns a freshly-allocated, NUL-terminated u8 vector.
 */
u8 *
shm_name_from_svm_map_region_args (svm_map_region_args_t * a)
{
  u8 *path;
  u8 *shm_name;
  u8 *split_point;
  u8 *mkdir_arg = 0;
  int root_path_offset = 0;
  int name_offset = 0;

  if (a->root_path)
    {
      /* Tolerate present or absent slashes */
      if (a->root_path[0] == '/')
	root_path_offset++;

      /* create the root_path under /dev/shm
         iterate through path creating directories */
      path = format (0, "/dev/shm/%s%c", &a->root_path[root_path_offset], 0);
      split_point = path + 1;
      vec_add1 (mkdir_arg, '-');

      while (*split_point)
	{
	  while (*split_point && *split_point != '/')
	    {
	      vec_add1 (mkdir_arg, *split_point);
	      split_point++;
	    }
	  vec_add1 (mkdir_arg, 0);

	  /* ready to descend another level */
	  mkdir_arg[vec_len (mkdir_arg) - 1] = '-';
	  split_point++;
	}
      vec_free (mkdir_arg);
      vec_free (path);

      if (a->name[0] == '/')
	name_offset = 1;

      shm_name = format (0, "/%s-%s%c", &a->root_path[root_path_offset],
			 &a->name[name_offset], 0);
    }
  else
    shm_name = format (0, "%s%c", a->name, 0);

  return (shm_name);
}

/*
 * Initialize a freshly-mapped region: process-shared mutex and
 * condvar, private region heap just past the header page, client-pid
 * vector, and the page-allocation bitmap with the header + private
 * heap pages pre-marked in-use.
 *
 * NOTE(review): SOURCE is truncated partway through this function
 * (mid-comment, below); the remainder — and everything up to
 * svm_region_unmap — is missing from this chunk.  Code reproduced
 * verbatim as far as it goes.
 */
void
svm_region_init_mapped_region (svm_map_region_args_t * a, svm_region_t * rp)
{
  pthread_mutexattr_t attr;
  pthread_condattr_t cattr;
  int nbits, words, bit;
  int overhead_space;
  void *oldheap;
  uword data_base;
  ASSERT (rp);
  int rv;

  memset (rp, 0, sizeof (*rp));

  /* Mutex and condvar must be PTHREAD_PROCESS_SHARED: they live in
     the shared mapping and are used across processes. */
  if (pthread_mutexattr_init (&attr))
    clib_unix_warning ("mutexattr_init");

  if (pthread_mutexattr_setpshared (&attr, PTHREAD_PROCESS_SHARED))
    clib_unix_warning ("mutexattr_setpshared");

  if (pthread_mutex_init (&rp->mutex, &attr))
    clib_unix_warning ("mutex_init");

  if (pthread_mutexattr_destroy (&attr))
    clib_unix_warning ("mutexattr_destroy");

  if (pthread_condattr_init (&cattr))
    clib_unix_warning ("condattr_init");

  if (pthread_condattr_setpshared (&cattr, PTHREAD_PROCESS_SHARED))
    clib_unix_warning ("condattr_setpshared");

  if (pthread_cond_init (&rp->condvar, &cattr))
    clib_unix_warning ("cond_init");

  if (pthread_condattr_destroy (&cattr))
    clib_unix_warning ("condattr_destroy");

  region_lock (rp, 1);

  rp->virtual_base = a->baseva;
  rp->virtual_size = a->size;

  /* Private heap occupies the VA just past the one-page header. */
  rp->region_heap =
    mheap_alloc_with_flags (uword_to_pointer
			    (a->baseva + MMAP_PAGESIZE, void *),
			    (a->pvt_heap_size !=
			     0) ? a->pvt_heap_size : SVM_PVT_MHEAP_SIZE,
			    MHEAP_FLAG_DISABLE_VM);
  oldheap = svm_push_pvt_heap (rp);

  rp->region_name = (char *) format (0, "%s%c", a->name, 0);
  vec_add1 (rp->client_pids, getpid ());

  /* One bitmap bit per page of the region. */
  nbits = rp->virtual_size / MMAP_PAGESIZE;

  ASSERT (nbits > 0);
  rp->bitmap_size = nbits;
  words = (nbits + BITS (uword) - 1) / BITS (uword);
  vec_validate (rp->bitmap, words - 1);

  /* Mark the header page plus the private heap pages as allocated. */
  overhead_space = MMAP_PAGESIZE /* header */  +
    ((a->pvt_heap_size != 0) ? a->pvt_heap_size : SVM_PVT_MHEAP_SIZE);

  bit = 0;
  data_base = (uword) rp->virtual_base;

  if (a->flags & SVM_FLAGS_NODATA)
    rp->flags |= SVM_FLAGS_NEED_DATA_INIT;

  do
    {
      clib_bitmap_set_no_check (rp->bitmap, bit, 1);
      bit++;
      overhead_space -= MMAP_PAGESIZE;
      data_base += MMAP_PAGESIZE;
    }
  while (overhead_space > 0);

  rp->data_base = (void *) data_base;

  /* [TRUNCATED in this chunk] original comment began:
   * Note: although the POSIX spec guarantees th */
comment { NOTE(review): this fragment is a VPP CLI exec script mis-joined }
comment { into svm.c by a bad file merge; it belongs in a test/startup }
comment { config.  $sidN / $midN / $socketN / $macN / $vifN_mac are }
comment { exec-macro variables supplied by the caller. }

comment { First memif master: socket 1, addressed 1.1.$sid1.1/24 }
create memif socket id $sid1 filename $socket1
create interface memif id $mid1 socket-id $sid1 hw-addr $mac1 master
set interface ip address memif$sid1/$mid1 1.1.$sid1.1/24
set interface state memif$sid1/$mid1 up

comment { Second memif master: socket 2, addressed 1.1.$sid2.1/24 }
create memif socket id $sid2 filename $socket2
create interface memif id $mid2 socket-id $sid2 hw-addr $mac2 master
set interface ip address memif$sid2/$mid2 1.1.$sid2.1/24
set interface state memif$sid2/$mid2 up

comment { Static neighbors for the peer (.2) address on each subnet }
set ip neighbor static memif$sid1/$mid1 1.1.$sid1.2 $vif1_mac[0]
set ip neighbor static memif$sid2/$mid2 1.1.$sid2.2 $vif2_mac[0]

comment { Steer the test prefixes through the two memif next hops }
ip route add 10.0.0.0/8 via 1.1.$sid1.2
ip route add 20.0.0.0/8 via 1.1.$sid2.2
/*
 * [NOTE(review): the opening of this comment is truncated in this
 * chunk; it described unmapping a region.  Surviving tail:]
 * ... last customer, throw it away completely.
 * The root region mutex guarantees atomicity with respect to
 * a new region client showing up at the wrong moment.
 */
void
svm_region_unmap (void *rp_arg)
{
  int i, mypid = getpid ();
  int nclients_left;
  void *oldheap;
  uword virtual_base, virtual_size;
  svm_region_t *rp = rp_arg;
  char *name;

  /*
   * If we take a signal while holding one or more shared-memory
   * mutexes, we may end up back here from an otherwise
   * benign exit handler. Bail out to avoid a recursive
   * mutex screw-up.
   */
  if (nheld)
    return;

  ASSERT (rp);
  ASSERT (root_rp);

  if (CLIB_DEBUG > 1)
    clib_warning ("[%d] unmap region %s", getpid (), rp->region_name);

  /* Lock order: root region first, then the target region. */
  region_lock (root_rp, 5);
  region_lock (rp, 6);

  oldheap = svm_push_pvt_heap (rp);	/* nb vec_delete() in the loop */

  /* Remove the caller from the list of mappers */
  for (i = 0; i < vec_len (rp->client_pids); i++)
    {
      if (rp->client_pids[i] == mypid)
	{
	  vec_delete (rp->client_pids, 1, i);
	  goto found;
	}
    }
  clib_warning ("pid %d AWOL", mypid);

found:

  svm_pop_heap (oldheap);

  nclients_left = vec_len (rp->client_pids);
  virtual_base = rp->virtual_base;
  virtual_size = rp->virtual_size;

  if (nclients_left == 0)
    {
      int index, nbits, i;
      svm_main_region_t *mp;
      uword *p;
      svm_subregion_t *subp;

      /* Kill the region, last guy on his way out */

      oldheap = svm_push_pvt_heap (root_rp);
      /* Copy the name before the region heap disappears. */
      name = vec_dup (rp->region_name);

      virtual_base = rp->virtual_base;
      virtual_size = rp->virtual_size;

      /* Figure out which bits to clear in the root region bitmap */
      index = (virtual_base - root_rp->virtual_base) / MMAP_PAGESIZE;
      nbits = (virtual_size + MMAP_PAGESIZE - 1) / MMAP_PAGESIZE;

#if CLIB_DEBUG > 1
      clib_warning ("clear %d bits at index %d", nbits, index);
#endif
      /* Give back the allocated VM */
      for (i = 0; i < nbits; i++)
	{
	  clib_bitmap_set_no_check (root_rp->bitmap, index + i, 0);
	}

      mp = root_rp->data_base;

      p = hash_get_mem (mp->name_hash, name);

      /* Better never happen ... */
      if (p == NULL)
	{
	  region_unlock (rp);
	  region_unlock (root_rp);
	  svm_pop_heap (oldheap);
	  clib_warning ("Region name '%s' not found?", name);
	  return;
	}

      /* Remove from the root region subregion pool */
      subp = mp->subregions + p[0];
      pool_put (mp->subregions, subp);

      hash_unset_mem (mp->name_hash, name);

      vec_free (name);

      /* Unlock/unlink the region BEFORE munmap: the mutex lives
         inside the mapping being torn down. */
      region_unlock (rp);
      svm_region_unlink (rp);
      munmap ((void *) virtual_base, virtual_size);
      region_unlock (root_rp);
      svm_pop_heap (oldheap);
      return;
    }

  /* Other clients remain: just detach our own mapping. */
  region_unlock (rp);
  region_unlock (root_rp);

  munmap ((void *) virtual_base, virtual_size);
}

/*
 * svm_region_exit
 *
 * Detach this process from the root region (refcounted); the last
 * client out unlinks the backing object and unmaps it.
 */
void
svm_region_exit ()
{
  void *oldheap;
  int i, mypid = getpid ();
  uword virtual_base, virtual_size;

  /* It felt so nice we did it twice... */
  if (root_rp == 0)
    return;

  if (--root_rp_refcount > 0)
    return;

  /*
   * If we take a signal while holding one or more shared-memory
   * mutexes, we may end up back here from an otherwise
   * benign exit handler. Bail out to avoid a recursive
   * mutex screw-up.
   */
  if (nheld)
    return;

  region_lock (root_rp, 7);
  oldheap = svm_push_pvt_heap (root_rp);

  virtual_base = root_rp->virtual_base;
  virtual_size = root_rp->virtual_size;

  for (i = 0; i < vec_len (root_rp->client_pids); i++)
    {
      if (root_rp->client_pids[i] == mypid)
	{
	  vec_delete (root_rp->client_pids, 1, i);
	  goto found;
	}
    }
  clib_warning ("pid %d AWOL", mypid);

found:

  if (vec_len (root_rp->client_pids) == 0)
    svm_region_unlink (root_rp);

  region_unlock (root_rp);
  svm_pop_heap (oldheap);

  root_rp = 0;
  munmap ((void *) virtual_base, virtual_size);
}

/*
 * Remove dead clients from one region's pid list.  Caller holds
 * rp->mutex.  kill(pid, 0) < 0 means the process no longer exists
 * (or is not signalable); such pids are deleted in place.
 */
void
svm_client_scan_this_region_nolock (svm_region_t * rp)
{
  int j;
  int mypid = getpid ();
  void *oldheap;

  for (j = 0; j < vec_len (rp->client_pids); j++)
    {
      if (mypid == rp->client_pids[j])
	continue;
      if (rp->client_pids[j] && (kill (rp->client_pids[j], 0) < 0))
	{
	  clib_warning ("%s: cleanup ghost pid %d",
			rp->region_name, rp->client_pids[j]);
	  /* nb: client vec in rp->region_heap */
	  oldheap = svm_push_pvt_heap (rp);
	  vec_delete (rp->client_pids, 1, j);
	  /* Stay on this index; vec_delete shifted the tail down. */
	  j--;
	  svm_pop_heap (oldheap);
	}
    }
}

/*
 * Scan svm regions for dead clients
 */
void
svm_client_scan (const char *root_path)
{
  int i, j;
  svm_main_region_t *mp;
  svm_map_region_args_t *a = 0;
  /* NOTE(review): this local deliberately(?) shadows the file-scope
     static root_rp; it is fetched via svm_get_root_rp() below. */
  svm_region_t *root_rp;
  svm_region_t *rp;
  svm_subregion_t *subp;
  u8 *name = 0;
  u8 **svm_names = 0;

  void *oldheap;
  int mypid = getpid ();

  vec_validate (a, 0);

  svm_region_init_chroot (root_path);

  root_rp = svm_get_root_rp ();

  pthread_mutex_lock (&root_rp->mutex);

  mp = root_rp->data_base;

  /* First, purge ghost pids from the root region itself. */
  for (j = 0; j < vec_len (root_rp->client_pids); j++)
    {
      if (mypid == root_rp->client_pids[j])
	continue;
      if (root_rp->client_pids[j] && (kill (root_rp->client_pids[j], 0) < 0))
	{
	  clib_warning ("%s: cleanup ghost pid %d",
			root_rp->region_name, root_rp->client_pids[j]);
	  /* nb: client vec in root_rp->region_heap */
	  oldheap = svm_push_pvt_heap (root_rp);
	  vec_delete (root_rp->client_pids, 1, j);
	  j--;
	  svm_pop_heap (oldheap);
	}
    }

  /*
   * Snapshoot names, can't hold root rp mutex across
   * find_or_create.
   */
  /* *INDENT-OFF* */
  pool_foreach (subp, mp->subregions, ({
        name = vec_dup (subp->subregion_name);
        vec_add1(svm_names, name);
      }));
  /* *INDENT-ON* */

  pthread_mutex_unlock (&root_rp->mutex);

  /* Then scan each subregion by name, mapping and unmapping it. */
  for (i = 0; i < vec_len (svm_names); i++)
    {
      vec_validate (a, 0);
      a->root_path = root_path;
      a->name = (char *) svm_names[i];
      rp = svm_region_find_or_create (a);
      if (rp)
	{
	  pthread_mutex_lock (&rp->mutex);

	  svm_client_scan_this_region_nolock (rp);

	  pthread_mutex_unlock (&rp->mutex);
	  svm_region_unmap (rp);
	  vec_free (svm_names[i]);
	}
      vec_free (a);
    }
  vec_free (svm_names);

  svm_region_exit ();

  vec_free (a);
}

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */