From cef1db9c13f57a1fc49c9e500adffafa0b9ca728 Mon Sep 17 00:00:00 2001
From: Damjan Marion
Date: Wed, 28 Mar 2018 18:27:38 +0200
Subject: memif: zero copy slave

Change-Id: I65306fb1f8e39221dd1d8c00737a7fb1c0129ba8
Signed-off-by: Damjan Marion
---
 src/plugins/memif/cli.c     |  43 ++++--
 src/plugins/memif/device.c  | 150 +++++++++++++++++---
 src/plugins/memif/memif.c   | 260 ++++++++++++++++++++--------------
 src/plugins/memif/node.c    | 332 +++++++++++++++++++++++++++++++++++++++++++-
 src/plugins/memif/private.h |   8 +-
 src/plugins/memif/socket.c  |   5 +-
 src/vlib/buffer_funcs.h     |  24 ++++
 7 files changed, 677 insertions(+), 145 deletions(-)

diff --git a/src/plugins/memif/cli.c b/src/plugins/memif/cli.c
index ed2b1b7e6d0..f2c8829aa6a 100644
--- a/src/plugins/memif/cli.c
+++ b/src/plugins/memif/cli.c
@@ -183,6 +183,8 @@ memif_create_command_fn (vlib_main_t * vm, unformat_input_t * input,
   if (!unformat_user (input, unformat_line_input, line_input))
     return 0;
 
+  args.is_zero_copy = 1;
+
   while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
     {
       if (unformat (line_input, "id %u", &args.id))
@@ -203,6 +205,8 @@ memif_create_command_fn (vlib_main_t * vm, unformat_input_t * input,
 	args.is_master = 1;
       else if (unformat (line_input, "slave"))
 	args.is_master = 0;
+      else if (unformat (line_input, "no-zero-copy"))
+	args.is_zero_copy = 0;
       else if (unformat (line_input, "mode ip"))
 	args.mode = MEMIF_INTERFACE_MODE_IP;
       else if (unformat (line_input, "hw-addr %U",
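Zero-copy is now the default for newly created slave interfaces, so the new
keyword is an opt-out rather than an opt-in. A usage sketch from the VPP debug
CLI (the ids are illustrative, and the exact command path has varied between
releases):

    vpp# create interface memif id 0 slave
    vpp# create interface memif id 1 slave no-zero-copy
    vpp# show memif

Masters are unaffected: as the hunks further below show, the flag is only
honored on the slave side, which owns the buffer memory.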
@@ -421,6 +425,7 @@ memif_show_command_fn (vlib_main_t * vm, unformat_input_t * input,
   memif_main_t *mm = &memif_main;
   memif_if_t *mif;
   vnet_main_t *vnm = vnet_get_main ();
+  memif_region_t *mr;
   memif_queue_t *mq;
   uword i;
   int show_descr = 0;
@@ -498,10 +503,10 @@ memif_show_command_fn (vlib_main_t * vm, unformat_input_t * input,
 	  vlib_cli_output (vm, " listener-fd %d conn-fd %d",
 			   msf->sock ? msf->sock->fd : 0,
 			   mif->sock ? mif->sock->fd : 0);
-	  vlib_cli_output (vm,
-			   " num-s2m-rings %u num-m2s-rings %u buffer-size %u",
+	  vlib_cli_output (vm, " num-s2m-rings %u num-m2s-rings %u "
+			   "buffer-size %u num-regions %u",
 			   mif->run.num_s2m_rings, mif->run.num_m2s_rings,
-			   mif->run.buffer_size);
+			   mif->run.buffer_size, vec_len (mif->regions));
 
 	  if (mif->local_disc_string)
 	    vlib_cli_output (vm, " local-disc-reason \"%s\"",
@@ -510,20 +515,28 @@ memif_show_command_fn (vlib_main_t * vm, unformat_input_t * input,
 	    vlib_cli_output (vm, " remote-disc-reason \"%s\"",
 			     mif->remote_disc_string);
 
+	  /* *INDENT-OFF* */
+	  vec_foreach_index (i, mif->regions)
+	    {
+	      mr = vec_elt_at_index (mif->regions, i);
+	      vlib_cli_output (vm, " region %u size %u fd %d", i,
+			       mr->region_size, mr->fd);
+	    }
 	  vec_foreach_index (i, mif->tx_queues)
-	  {
-	    mq = vec_elt_at_index (mif->tx_queues, i);
-	    vlib_cli_output (vm, " %U", format_memif_queue, mq, i);
-	    if (show_descr)
-	      vlib_cli_output (vm, " %U", format_memif_descriptor, mif, mq);
-	  }
+	    {
+	      mq = vec_elt_at_index (mif->tx_queues, i);
+	      vlib_cli_output (vm, " %U", format_memif_queue, mq, i);
+	      if (show_descr)
+		vlib_cli_output (vm, " %U", format_memif_descriptor, mif, mq);
+	    }
 	  vec_foreach_index (i, mif->rx_queues)
-	  {
-	    mq = vec_elt_at_index (mif->rx_queues, i);
-	    vlib_cli_output (vm, " %U", format_memif_queue, mq, i);
-	    if (show_descr)
-	      vlib_cli_output (vm, " %U", format_memif_descriptor, mif, mq);
-	  }
+	    {
+	      mq = vec_elt_at_index (mif->rx_queues, i);
+	      vlib_cli_output (vm, " %U", format_memif_queue, mq, i);
+	      if (show_descr)
+		vlib_cli_output (vm, " %U", format_memif_descriptor, mif, mq);
+	    }
+	  /* *INDENT-ON* */
 	}
     done:
   vec_free (hw_if_indices);

diff --git a/src/plugins/memif/device.c b/src/plugins/memif/device.c
index 112db57b4b4..6accad6e673 100644
--- a/src/plugins/memif/device.c
+++ b/src/plugins/memif/device.c
@@ -99,35 +99,20 @@ memif_add_copy_op (memif_per_thread_data_t * ptd, void *data, u32 len,
 static_always_inline uword
 memif_interface_tx_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 			   vlib_frame_t * frame, memif_if_t * mif,
-			   memif_ring_type_t type)
+			   memif_ring_type_t type, memif_queue_t * mq,
+			   memif_per_thread_data_t * ptd)
 {
-  u8 qid;
   memif_ring_t *ring;
   u32 *buffers = vlib_frame_args (frame);
   u32 n_left = frame->n_vectors;
   u32 n_copy_op;
   u16 ring_size, mask, slot, free_slots;
-  u32 thread_index = vlib_get_thread_index ();
-  memif_per_thread_data_t *ptd = vec_elt_at_index (memif_main.per_thread_data,
-						   thread_index);
-  u8 tx_queues = vec_len (mif->tx_queues);
-  memif_queue_t *mq;
   int n_retries = 5;
   vlib_buffer_t *b0, *b1, *b2, *b3;
   memif_copy_op_t *co;
   memif_region_index_t last_region = ~0;
   void *last_region_shm = 0;
 
-  if (tx_queues < vec_len (vlib_mains))
-    {
-      ASSERT (tx_queues > 0);
-      qid = thread_index % tx_queues;
-      clib_spinlock_lock_if_init (&mif->lockp);
-    }
-  else
-    qid = thread_index;
-
-  mq = vec_elt_at_index (mif->tx_queues, qid);
   ring = mq->ring;
   ring_size = 1 << mq->log2_ring_size;
   mask = ring_size - 1;
@@ -307,6 +292,113 @@ no_free_slots:
   return frame->n_vectors;
 }
 
+static_always_inline uword
+memif_interface_tx_zc_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
+			      vlib_frame_t * frame, memif_if_t * mif,
+			      memif_queue_t * mq,
+			      memif_per_thread_data_t * ptd)
+{
+  memif_ring_t *ring = mq->ring;
+  u32 *buffers = vlib_frame_args (frame);
+  u32 n_left = frame->n_vectors;
+  u16 slot, free_slots, n_free;
+  u16 ring_size = 1 << mq->log2_ring_size;
+  u16 mask = ring_size - 1;
+  int n_retries = 5;
+  vlib_buffer_t *b0;
+
+retry:
+  n_free = ring->tail - mq->last_tail;
+  if (n_free >= 16)
+    {
+      vlib_buffer_free_from_ring_no_next (vm, mq->buffers, mq->last_tail,
+					  ring_size, n_free);
+      mq->last_tail += n_free;
+    }
+
+  slot = ring->head;
+  free_slots = ring_size - ring->head + mq->last_tail;
+
+  while (n_left && free_slots)
+    {
+      u16 s0;
+      u16 slots_in_packet = 1;
+      memif_desc_t *d0;
+      u32 bi0;
+
+      CLIB_PREFETCH (&ring->desc[(slot + 8) & mask], CLIB_CACHE_LINE_BYTES,
+		     STORE);
+
+      if (PREDICT_TRUE (n_left >= 4))
+	vlib_prefetch_buffer_header (vlib_get_buffer (vm, buffers[3]), LOAD);
+
+      bi0 = buffers[0];
+
+    next_in_chain:
+      s0 = slot & mask;
+      d0 = &ring->desc[s0];
+      mq->buffers[s0] = bi0;
+      b0 = vlib_get_buffer (vm, bi0);
+
+      d0->region = b0->buffer_pool_index + 1;
+      d0->offset = (void *) b0->data + b0->current_data -
+	mif->regions[d0->region].shm;
+      d0->length = b0->current_length;
+
+      free_slots--;
+      slot++;
+
+      if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_NEXT_PRESENT))
+	{
+	  if (PREDICT_FALSE (free_slots == 0))
+	    {
+	      /* revert to last fully processed packet */
+	      free_slots += slots_in_packet;
+	      slot -= slots_in_packet;
+	      goto no_free_slots;
+	    }
+
+	  d0->flags = MEMIF_DESC_FLAG_NEXT;
+	  bi0 = b0->next_buffer;
+
+	  /* next */
+	  slots_in_packet++;
+	  goto next_in_chain;
+	}
+
+      d0->flags = 0;
+
+      /* next from */
+      buffers++;
+      n_left--;
+    }
+no_free_slots:
+
+  CLIB_MEMORY_STORE_BARRIER ();
+  ring->head = slot;
+
+  if (n_left && n_retries--)
+    goto retry;
+
+  clib_spinlock_unlock_if_init (&mif->lockp);
+
+  if (n_left)
+    {
+      vlib_error_count (vm, node->node_index, MEMIF_TX_ERROR_NO_FREE_SLOTS,
+			n_left);
+      vlib_buffer_free (vm, buffers, n_left);
+    }
+
+  if ((ring->flags & MEMIF_RING_FLAG_MASK_INT) == 0 && mq->int_fd > -1)
+    {
+      u64 b = 1;
+      CLIB_UNUSED (int r) = write (mq->int_fd, &b, sizeof (b));
+      mq->int_count++;
+    }
+
+  return frame->n_vectors;
+}
+
 uword
 CLIB_MULTIARCH_FN (memif_interface_tx) (vlib_main_t * vm,
 					vlib_node_runtime_t * node,
@@ -315,11 +407,29 @@ CLIB_MULTIARCH_FN (memif_interface_tx) (vlib_main_t * vm,
   memif_main_t *nm = &memif_main;
   vnet_interface_output_runtime_t *rund = (void *) node->runtime_data;
   memif_if_t *mif = pool_elt_at_index (nm->interfaces, rund->dev_instance);
+  memif_queue_t *mq;
+  u32 thread_index = vlib_get_thread_index ();
+  memif_per_thread_data_t *ptd = vec_elt_at_index (memif_main.per_thread_data,
+						   thread_index);
+  u8 tx_queues = vec_len (mif->tx_queues);
+
+  if (tx_queues < vec_len (vlib_mains))
+    {
+      ASSERT (tx_queues > 0);
+      mq = vec_elt_at_index (mif->tx_queues, thread_index % tx_queues);
+      clib_spinlock_lock_if_init (&mif->lockp);
+    }
+  else
+    mq = vec_elt_at_index (mif->tx_queues, thread_index);
 
-  if (mif->flags & MEMIF_IF_FLAG_IS_SLAVE)
-    return memif_interface_tx_inline (vm, node, frame, mif, MEMIF_RING_S2M);
+  if (mif->flags & MEMIF_IF_FLAG_ZERO_COPY)
+    return memif_interface_tx_zc_inline (vm, node, frame, mif, mq, ptd);
+  else if (mif->flags & MEMIF_IF_FLAG_IS_SLAVE)
+    return memif_interface_tx_inline (vm, node, frame, mif, MEMIF_RING_S2M,
+				      mq, ptd);
   else
-    return memif_interface_tx_inline (vm, node, frame, mif, MEMIF_RING_M2S);
+    return memif_interface_tx_inline (vm, node, frame, mif, MEMIF_RING_M2S,
+				      mq, ptd);
 }
 
 static __clib_unused void
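Both TX paths rely on the same convention: ring->head, ring->tail and
mq->last_tail are free-running u16 counters that are masked only when used as
an array index, so distances such as "ring->tail - mq->last_tail" stay correct
across wrap-around. A stand-alone sketch of that arithmetic (toy values, not
the real memif structures):

    #include <stdint.h>
    #include <stdio.h>

    /* Toy model of the u16 ring arithmetic used above: counters run free
       and only indexing applies the mask, so subtraction is modular. */
    int
    main (void)
    {
      uint16_t ring_size = 1 << 10;       /* log2_ring_size = 10 */
      uint16_t mask = ring_size - 1;
      uint16_t last_tail = 65530;         /* consumer, about to wrap */
      uint16_t tail = 10;                 /* producer, already wrapped */

      uint16_t n_free = tail - last_tail; /* modular arithmetic: 16 slots */
      printf ("n_free = %u, first slot index = %u\n",
              n_free, (uint16_t) (last_tail & mask));
      return 0;
    }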
diff --git a/src/plugins/memif/memif.c b/src/plugins/memif/memif.c
index c83a955090c..f533ab97e80 100644
--- a/src/plugins/memif/memif.c
+++ b/src/plugins/memif/memif.c
@@ -110,36 +110,42 @@ memif_disconnect (memif_if_t * mif, clib_error_t * err)
       clib_mem_free (mif->sock);
     }
 
+  /* *INDENT-OFF* */
   vec_foreach_index (i, mif->rx_queues)
-  {
-    mq = vec_elt_at_index (mif->rx_queues, i);
-    if (mq->ring)
-      {
-	int rv;
-	rv = vnet_hw_interface_unassign_rx_thread (vnm, mif->hw_if_index, i);
-	if (rv)
-	  DBG ("Warning: unable to unassign interface %d, "
-	       "queue %d: rc=%d", mif->hw_if_index, i, rv);
-	mq->ring = 0;
-      }
-  }
+    {
+      mq = vec_elt_at_index (mif->rx_queues, i);
+      if (mq->ring)
+	{
+	  int rv;
+	  rv = vnet_hw_interface_unassign_rx_thread (vnm, mif->hw_if_index, i);
+	  if (rv)
+	    DBG ("Warning: unable to unassign interface %d, "
+		 "queue %d: rc=%d", mif->hw_if_index, i, rv);
+	  mq->ring = 0;
+	}
+    }
 
   /* free tx and rx queues */
-  vec_foreach (mq, mif->rx_queues) memif_queue_intfd_close (mq);
+  vec_foreach (mq, mif->rx_queues)
+    memif_queue_intfd_close (mq);
   vec_free (mif->rx_queues);
 
-  vec_foreach (mq, mif->tx_queues) memif_queue_intfd_close (mq);
+  vec_foreach (mq, mif->tx_queues)
+    memif_queue_intfd_close (mq);
   vec_free (mif->tx_queues);
 
   /* free memory regions */
   vec_foreach (mr, mif->regions)
-  {
-    int rv;
-    if ((rv = munmap (mr->shm, mr->region_size)))
-      clib_warning ("munmap failed, rv = %d", rv);
-    if (mr->fd > -1)
-      close (mr->fd);
-  }
+    {
+      int rv;
+      if (mr->is_external)
+	continue;
+      if ((rv = munmap (mr->shm, mr->region_size)))
+	clib_warning ("munmap failed, rv = %d", rv);
+      if (mr->fd > -1)
+	close (mr->fd);
+    }
+  /* *INDENT-ON* */
   vec_free (mif->regions);
   vec_free (mif->remote_name);
   vec_free (mif->remote_if_name);
@@ -184,66 +190,70 @@ memif_connect (memif_if_t * mif)
   vec_free (mif->local_disc_string);
   vec_free (mif->remote_disc_string);
 
+  /* *INDENT-OFF* */
   vec_foreach (mr, mif->regions)
-  {
-    if (mr->shm)
-      continue;
+    {
+      if (mr->shm)
+	continue;
 
-    if (mr->fd < 0)
-      clib_error_return (0, "no memory region fd");
+      if (mr->fd < 0)
+	clib_error_return (0, "no memory region fd");
 
-    if ((mr->shm = mmap (NULL, mr->region_size, PROT_READ | PROT_WRITE,
-			 MAP_SHARED, mr->fd, 0)) == MAP_FAILED)
-      return clib_error_return_unix (0, "mmap");
-  }
+      if ((mr->shm = mmap (NULL, mr->region_size, PROT_READ | PROT_WRITE,
+			   MAP_SHARED, mr->fd, 0)) == MAP_FAILED)
+	return clib_error_return_unix (0, "mmap");
+    }
+  /* *INDENT-ON* */
 
   template.read_function = memif_int_fd_read_ready;
 
+  /* *INDENT-OFF* */
   vec_foreach_index (i, mif->tx_queues)
-  {
-    memif_queue_t *mq = vec_elt_at_index (mif->tx_queues, i);
+    {
+      memif_queue_t *mq = vec_elt_at_index (mif->tx_queues, i);
 
-    mq->ring = mif->regions[mq->region].shm + mq->offset;
-    if (mq->ring->cookie != MEMIF_COOKIE)
-      return clib_error_return (0, "wrong cookie on tx ring %u", i);
-  }
+      mq->ring = mif->regions[mq->region].shm + mq->offset;
+      if (mq->ring->cookie != MEMIF_COOKIE)
+	return clib_error_return (0, "wrong cookie on tx ring %u", i);
+    }
 
   vec_foreach_index (i, mif->rx_queues)
-  {
-    memif_queue_t *mq = vec_elt_at_index (mif->rx_queues, i);
-    int rv;
-
-    mq->ring = mif->regions[mq->region].shm + mq->offset;
-    if (mq->ring->cookie != MEMIF_COOKIE)
-      return clib_error_return (0, "wrong cookie on tx ring %u", i);
-
-    if (mq->int_fd > -1)
-      {
-	template.file_descriptor = mq->int_fd;
-	template.private_data = (mif->dev_instance << 16) | (i & 0xFFFF);
-	template.description = format (0, "%U rx %u int",
-				       format_memif_device_name,
-				       mif->dev_instance, i);
-	memif_file_add (&mq->int_clib_file_index, &template);
-      }
-    vnet_hw_interface_assign_rx_thread (vnm, mif->hw_if_index, i, ~0);
-    rv = vnet_hw_interface_set_rx_mode (vnm, mif->hw_if_index, i,
-					VNET_HW_INTERFACE_RX_MODE_DEFAULT);
-    if (rv)
-      clib_warning
-	("Warning: unable to set rx mode for interface %d queue %d: "
-	 "rc=%d", mif->hw_if_index, i, rv);
-    else
-      {
-	vnet_hw_interface_rx_mode rxmode;
-	vnet_hw_interface_get_rx_mode (vnm, mif->hw_if_index, i, &rxmode);
-
-	if (rxmode == VNET_HW_INTERFACE_RX_MODE_POLLING)
-	  mq->ring->flags |= MEMIF_RING_FLAG_MASK_INT;
-	else
-	  vnet_device_input_set_interrupt_pending (vnm, mif->hw_if_index, i);
-      }
-  }
+    {
+      memif_queue_t *mq = vec_elt_at_index (mif->rx_queues, i);
+      int rv;
+
+      mq->ring = mif->regions[mq->region].shm + mq->offset;
+      if (mq->ring->cookie != MEMIF_COOKIE)
+	return clib_error_return (0, "wrong cookie on tx ring %u", i);
+
+      if (mq->int_fd > -1)
+	{
+	  template.file_descriptor = mq->int_fd;
+	  template.private_data = (mif->dev_instance << 16) | (i & 0xFFFF);
+	  template.description = format (0, "%U rx %u int",
+					 format_memif_device_name,
+					 mif->dev_instance, i);
+	  memif_file_add (&mq->int_clib_file_index, &template);
+	}
+      vnet_hw_interface_assign_rx_thread (vnm, mif->hw_if_index, i, ~0);
+      rv = vnet_hw_interface_set_rx_mode (vnm, mif->hw_if_index, i,
+					  VNET_HW_INTERFACE_RX_MODE_DEFAULT);
+      if (rv)
+	clib_warning
+	  ("Warning: unable to set rx mode for interface %d queue %d: "
+	   "rc=%d", mif->hw_if_index, i, rv);
+      else
+	{
+	  vnet_hw_interface_rx_mode rxmode;
+	  vnet_hw_interface_get_rx_mode (vnm, mif->hw_if_index, i, &rxmode);
+
+	  if (rxmode == VNET_HW_INTERFACE_RX_MODE_POLLING)
+	    mq->ring->flags |= MEMIF_RING_FLAG_MASK_INT;
+	  else
+	    vnet_device_input_set_interrupt_pending (vnm, mif->hw_if_index, i);
+	}
+    }
+  /* *INDENT-ON* */
 
   mif->flags &= ~MEMIF_IF_FLAG_CONNECTING;
   mif->flags |= MEMIF_IF_FLAG_CONNECTED;
@@ -270,6 +280,7 @@ memif_get_ring (memif_if_t * mif, memif_ring_type_t type, u16 ring_num)
 clib_error_t *
 memif_init_regions_and_queues (memif_if_t * mif)
 {
+  vlib_main_t *vm = vlib_get_main ();
   memif_ring_t *ring = NULL;
   int i, j;
   u64 buffer_offset;
@@ -277,16 +288,18 @@ memif_init_regions_and_queues (memif_if_t * mif)
   memif_region_t *r;
   clib_mem_vm_alloc_t alloc = { 0 };
   clib_error_t *err;
 
-  vec_validate_aligned (mif->regions, 0, CLIB_CACHE_LINE_BYTES);
-  r = vec_elt_at_index (mif->regions, 0);
+  ASSERT (vec_len (mif->regions) == 0);
+  vec_add2_aligned (mif->regions, r, 1, CLIB_CACHE_LINE_BYTES);
 
   buffer_offset = (mif->run.num_s2m_rings + mif->run.num_m2s_rings) *
     (sizeof (memif_ring_t) +
     sizeof (memif_desc_t) * (1 << mif->run.log2_ring_size));
 
-  r->region_size = buffer_offset +
-    mif->run.buffer_size * (1 << mif->run.log2_ring_size) *
-    (mif->run.num_s2m_rings + mif->run.num_m2s_rings);
+  r->region_size = buffer_offset;
+
+  if ((mif->flags & MEMIF_IF_FLAG_ZERO_COPY) == 0)
+    r->region_size += mif->run.buffer_size * (1 << mif->run.log2_ring_size) *
+      (mif->run.num_s2m_rings + mif->run.num_m2s_rings);
 
   alloc.name = "memif region";
   alloc.size = r->region_size;
@@ -299,11 +312,32 @@ memif_init_regions_and_queues (memif_if_t * mif)
   r->fd = alloc.fd;
   r->shm = alloc.addr;
 
+  if (mif->flags & MEMIF_IF_FLAG_ZERO_COPY)
+    {
+      vlib_buffer_pool_t *bp;
+      /* *INDENT-OFF* */
+      vec_foreach (bp, buffer_main.buffer_pools)
+	{
+	  vlib_physmem_region_t *pr;
+	  pr = vlib_physmem_get_region (vm, bp->physmem_region);
+	  vec_add2_aligned (mif->regions, r, 1, CLIB_CACHE_LINE_BYTES);
+	  r->fd = pr->fd;
+	  r->region_size = pr->size;
+	  r->shm = pr->mem;
+	  r->is_external = 1;
+	}
+      /* *INDENT-ON* */
+    }
+
   for (i = 0; i < mif->run.num_s2m_rings; i++)
     {
      ring = memif_get_ring (mif, MEMIF_RING_S2M, i);
      ring->head = ring->tail = 0;
      ring->cookie = MEMIF_COOKIE;
+
+      if (mif->flags & MEMIF_IF_FLAG_ZERO_COPY)
+	continue;
+
      for (j = 0; j < (1 << mif->run.log2_ring_size); j++)
	{
	  u16 slot = i * (1 << mif->run.log2_ring_size) + j;
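With zero-copy enabled, region 0 shrinks to ring metadata only, because
packet data stays in VPP's own buffer pools. A toy recomputation of the sizing
logic above, using illustrative numbers and stand-in struct sizes (not the
real memif layout):

    #include <stdint.h>
    #include <stdio.h>

    int
    main (void)
    {
      uint64_t num_rings = 1 + 1;      /* num_s2m_rings + num_m2s_rings */
      uint64_t ring_hdr = 128;         /* stand-in for sizeof (memif_ring_t) */
      uint64_t desc_sz = 16;           /* stand-in for sizeof (memif_desc_t) */
      uint64_t slots = 1 << 10;        /* 1 << log2_ring_size */
      uint64_t buffer_size = 2048;
      int zero_copy = 1;

      /* rings and descriptor tables are always in region 0 */
      uint64_t region_size = num_rings * (ring_hdr + desc_sz * slots);

      /* the packet-buffer area is only added when not zero-copy */
      if (!zero_copy)
        region_size += buffer_size * slots * num_rings;

      printf ("region 0: %lu bytes\n", (unsigned long) region_size);
      return 0;
    }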
@@ -318,6 +352,10 @@ memif_init_regions_and_queues (memif_if_t * mif)
      ring = memif_get_ring (mif, MEMIF_RING_M2S, i);
      ring->head = ring->tail = 0;
      ring->cookie = MEMIF_COOKIE;
+
+      if (mif->flags & MEMIF_IF_FLAG_ZERO_COPY)
+	continue;
+
      for (j = 0; j < (1 << mif->run.log2_ring_size); j++)
	{
	  u16 slot =
@@ -332,36 +370,48 @@ memif_init_regions_and_queues (memif_if_t * mif)
   ASSERT (mif->tx_queues == 0);
   vec_validate_aligned (mif->tx_queues, mif->run.num_s2m_rings - 1,
			CLIB_CACHE_LINE_BYTES);
+
+  /* *INDENT-OFF* */
   vec_foreach_index (i, mif->tx_queues)
-  {
-    memif_queue_t *mq = vec_elt_at_index (mif->tx_queues, i);
-    if ((mq->int_fd = eventfd (0, EFD_NONBLOCK)) < 0)
-      return clib_error_return_unix (0, "eventfd[tx queue %u]", i);
-    mq->int_clib_file_index = ~0;
-    mq->ring = memif_get_ring (mif, MEMIF_RING_S2M, i);
-    mq->log2_ring_size = mif->cfg.log2_ring_size;
-    mq->region = 0;
-    mq->offset = (void *) mq->ring - (void *) mif->regions[mq->region].shm;
-    mq->last_head = 0;
-    mq->type = MEMIF_RING_S2M;
-  }
+    {
+      memif_queue_t *mq = vec_elt_at_index (mif->tx_queues, i);
+      if ((mq->int_fd = eventfd (0, EFD_NONBLOCK)) < 0)
+	return clib_error_return_unix (0, "eventfd[tx queue %u]", i);
+      mq->int_clib_file_index = ~0;
+      mq->ring = memif_get_ring (mif, MEMIF_RING_S2M, i);
+      mq->log2_ring_size = mif->cfg.log2_ring_size;
+      mq->region = 0;
+      mq->offset = (void *) mq->ring - (void *) mif->regions[mq->region].shm;
+      mq->last_head = 0;
+      mq->type = MEMIF_RING_S2M;
+      if (mif->flags & MEMIF_IF_FLAG_ZERO_COPY)
+	vec_validate_aligned (mq->buffers, 1 << mq->log2_ring_size,
+			      CLIB_CACHE_LINE_BYTES);
+    }
+  /* *INDENT-ON* */
 
   ASSERT (mif->rx_queues == 0);
   vec_validate_aligned (mif->rx_queues, mif->run.num_m2s_rings - 1,
			CLIB_CACHE_LINE_BYTES);
+
+  /* *INDENT-OFF* */
   vec_foreach_index (i, mif->rx_queues)
-  {
-    memif_queue_t *mq = vec_elt_at_index (mif->rx_queues, i);
-    if ((mq->int_fd = eventfd (0, EFD_NONBLOCK)) < 0)
-      return clib_error_return_unix (0, "eventfd[rx queue %u]", i);
-    mq->int_clib_file_index = ~0;
-    mq->ring = memif_get_ring (mif, MEMIF_RING_M2S, i);
-    mq->log2_ring_size = mif->cfg.log2_ring_size;
-    mq->region = 0;
-    mq->offset = (void *) mq->ring - (void *) mif->regions[mq->region].shm;
-    mq->last_head = 0;
-    mq->type = MEMIF_RING_M2S;
-  }
+    {
+      memif_queue_t *mq = vec_elt_at_index (mif->rx_queues, i);
+      if ((mq->int_fd = eventfd (0, EFD_NONBLOCK)) < 0)
+	return clib_error_return_unix (0, "eventfd[rx queue %u]", i);
+      mq->int_clib_file_index = ~0;
+      mq->ring = memif_get_ring (mif, MEMIF_RING_M2S, i);
+      mq->log2_ring_size = mif->cfg.log2_ring_size;
+      mq->region = 0;
+      mq->offset = (void *) mq->ring - (void *) mif->regions[mq->region].shm;
+      mq->last_head = 0;
+      mq->type = MEMIF_RING_M2S;
+      if (mif->flags & MEMIF_IF_FLAG_ZERO_COPY)
+	vec_validate_aligned (mq->buffers, 1 << mq->log2_ring_size,
+			      CLIB_CACHE_LINE_BYTES);
+    }
+  /* *INDENT-ON* */
 
   return 0;
 }
@@ -616,10 +666,10 @@ memif_delete_if (vlib_main_t * vm, memif_if_t * mif)
       if (msf->is_listener)
	{
	  int i;
+	  /* *INDENT-OFF* */
	  vec_foreach_index (i, msf->pending_clients)
-	  {
	    memif_socket_close (msf->pending_clients + i);
-	  }
+	  /* *INDENT-ON* */
	  memif_socket_close (&msf->sock);
	  vec_free (msf->pending_clients);
	}
@@ -854,7 +904,11 @@ memif_create_if (vlib_main_t * vm, memif_create_if_args_t * args)
   msf->ref_cnt++;
 
   if (args->is_master == 0)
-    mif->flags |= MEMIF_IF_FLAG_IS_SLAVE;
+    {
+      mif->flags |= MEMIF_IF_FLAG_IS_SLAVE;
+      if (args->is_zero_copy)
+	mif->flags |= MEMIF_IF_FLAG_ZERO_COPY;
+    }
 
   hw = vnet_get_hw_interface (vnm, mif->hw_if_index);
   hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_INT_MODE;
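The memif.c changes above export each vlib buffer pool's physmem region to the
peer as memif region 1..N, keeping region 0 for the rings; a descriptor can
then name any VPP buffer as (region = buffer_pool_index + 1, offset into that
region). A simplified model of the translation on both sides (stand-in types,
not the real structs):

    #include <stdint.h>

    typedef struct { void *shm; uint64_t region_size; } region_t;

    /* slave side: describe a buffer that lives in pool `pool_index` */
    static inline void
    fill_desc (region_t * regions, uint16_t pool_index, void *pkt_data,
               uint16_t * region_out, uint32_t * offset_out)
    {
      *region_out = pool_index + 1;     /* region 0 is reserved for rings */
      *offset_out = (uint32_t) ((uint8_t *) pkt_data -
                                (uint8_t *) regions[*region_out].shm);
    }

    /* master side: turn (region, offset) back into a pointer after mmap */
    static inline void *
    desc_to_ptr (region_t * regions, uint16_t region, uint32_t offset)
    {
      return (uint8_t *) regions[region].shm + offset;
    }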
diff --git a/src/plugins/memif/node.c b/src/plugins/memif/node.c
index e8223d12435..e1ee63b47bb 100644
--- a/src/plugins/memif/node.c
+++ b/src/plugins/memif/node.c
@@ -141,6 +141,35 @@ memif_add_to_chain (vlib_main_t * vm, vlib_buffer_t * b, u32 * buffers,
     }
 }
 
+static_always_inline u32
+sat_sub (u32 x, u32 y)
+{
+  u32 res = x - y;
+  res &= -(res <= x);
+  return res;
+}
+
+/* branchless validation of the descriptor - uses saturated subtraction */
+static_always_inline u32
+memif_desc_is_invalid (memif_if_t * mif, memif_desc_t * d, u32 buffer_length)
+{
+  u32 rv;
+  u16 valid_flags = MEMIF_DESC_FLAG_NEXT;
+
+  rv = d->flags & (~valid_flags);
+  rv |= sat_sub (d->region + 1, vec_len (mif->regions));
+  rv |= sat_sub (d->length, buffer_length);
+  rv |= sat_sub (d->offset + d->length, mif->regions[d->region].region_size);
+
+  if (PREDICT_FALSE (rv))
+    {
+      mif->flags |= MEMIF_IF_FLAG_ERROR;
+      return 1;
+    }
+
+  return 0;
+}
+
 static_always_inline uword
 memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
			    vlib_frame_t * frame, memif_if_t * mif,
@@ -182,7 +211,6 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
      so start with IP header at offset 14 */
   start_offset = (mode == MEMIF_INTERFACE_MODE_IP) ? 14 : 0;
 
-
   /* for S2M rings, we are consumers of packet buffers, and for M2S rings
      we are producers of empty buffers */
   cur_slot = (type == MEMIF_RING_S2M) ? mq->last_head : mq->last_tail;
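sat_sub () returns x - y clamped to 0 on underflow, which lets
memif_desc_is_invalid () OR together several range checks and branch once at
the end: each term is nonzero exactly when its bound is violated, e.g.
sat_sub (d->length, buffer_length) is nonzero iff d->length > buffer_length.
A stand-alone check of that behavior:

    #include <stdint.h>
    #include <assert.h>

    /* copy of the saturated subtract above: (res <= x) is 1 iff no
       underflow occurred, and -(1) is all-ones, so the mask either keeps
       the result or zeroes it */
    static inline uint32_t
    sat_sub (uint32_t x, uint32_t y)
    {
      uint32_t res = x - y;
      res &= -(res <= x);
      return res;
    }

    int
    main (void)
    {
      assert (sat_sub (10, 3) == 7);     /* normal subtraction */
      assert (sat_sub (3, 10) == 0);     /* would underflow: clamped to 0 */
      /* descriptor-check pattern: nonzero iff length exceeds the bound */
      assert (sat_sub (2048, 2048) == 0);
      assert (sat_sub (2049, 2048) == 1);
      return 0;
    }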
@@ -492,6 +520,289 @@ refill:
   return n_rx_packets;
 }
 
+static_always_inline uword
+memif_device_input_zc_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
+			      vlib_frame_t * frame, memif_if_t * mif,
+			      u16 qid, memif_interface_mode_t mode)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  memif_main_t *mm = &memif_main;
+  memif_ring_t *ring;
+  memif_queue_t *mq;
+  u32 next_index;
+  uword n_trace = vlib_get_trace_count (vm, node);
+  u32 n_rx_packets = 0, n_rx_bytes = 0;
+  u32 *to_next = 0, *buffers;
+  u32 bi0, bi1, bi2, bi3;
+  vlib_buffer_t *b0, *b1, *b2, *b3;
+  u32 thread_index = vlib_get_thread_index ();
+  memif_per_thread_data_t *ptd = vec_elt_at_index (mm->per_thread_data,
+						   thread_index);
+  vlib_buffer_t *bt = &ptd->buffer_template;
+  u16 cur_slot, last_slot, ring_size, n_slots, mask, head;
+  i16 start_offset;
+  u32 buffer_length;
+  u16 n_alloc;
+
+  mq = vec_elt_at_index (mif->rx_queues, qid);
+  ring = mq->ring;
+  ring_size = 1 << mq->log2_ring_size;
+  mask = ring_size - 1;
+
+  next_index = (mode == MEMIF_INTERFACE_MODE_IP) ?
+    VNET_DEVICE_INPUT_NEXT_IP6_INPUT : VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
+
+  /* assume that somebody will want to add ethernet header on the packet
+     so start with IP header at offset 14 */
+  start_offset = (mode == MEMIF_INTERFACE_MODE_IP) ? 14 : 0;
+  buffer_length = VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES - start_offset;
+
+  cur_slot = mq->last_tail;
+  last_slot = ring->tail;
+  if (cur_slot == last_slot)
+    goto refill;
+  n_slots = last_slot - cur_slot;
+
+  /* process ring slots */
+  vec_validate_aligned (ptd->buffers, MEMIF_RX_VECTOR_SZ,
+			CLIB_CACHE_LINE_BYTES);
+  while (n_slots && n_rx_packets < MEMIF_RX_VECTOR_SZ)
+    {
+      u16 s0;
+      memif_desc_t *d0;
+      vlib_buffer_t *hb;
+
+      s0 = cur_slot & mask;
+      bi0 = mq->buffers[s0];
+      ptd->buffers[n_rx_packets++] = bi0;
+
+      CLIB_PREFETCH (&ring->desc[(cur_slot + 8) & mask],
+		     CLIB_CACHE_LINE_BYTES, LOAD);
+      d0 = &ring->desc[s0];
+      hb = b0 = vlib_get_buffer (vm, bi0);
+      b0->current_data = start_offset;
+      b0->current_length = start_offset + d0->length;
+
+      if (0 && memif_desc_is_invalid (mif, d0, buffer_length))
+	return 0;
+
+      cur_slot++;
+      n_slots--;
+      if (PREDICT_FALSE ((d0->flags & MEMIF_DESC_FLAG_NEXT) && n_slots))
+	{
+	  hb->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
+	next_slot:
+	  s0 = cur_slot & mask;
+	  d0 = &ring->desc[s0];
+	  bi0 = mq->buffers[s0];
+
+	  /* previous buffer */
+	  b0->next_buffer = bi0;
+	  b0->flags |= VLIB_BUFFER_NEXT_PRESENT;
+
+	  /* current buffer */
+	  b0 = vlib_get_buffer (vm, bi0);
+	  b0->current_data = start_offset;
+	  b0->current_length = start_offset + d0->length;
+	  hb->total_length_not_including_first_buffer += d0->length;
+
+	  cur_slot++;
+	  n_slots--;
+	  if ((d0->flags & MEMIF_DESC_FLAG_NEXT) && n_slots)
+	    goto next_slot;
+	}
+    }
+
+  /* release slots from the ring */
+  mq->last_tail = cur_slot;
+
+  u32 n_from = n_rx_packets;
+
+  vnet_buffer (bt)->sw_if_index[VLIB_RX] = mif->sw_if_index;
+
+  buffers = ptd->buffers;
+
+  while (n_from)
+    {
+      u32 n_left_to_next;
+      u32 next0, next1, next2, next3;
+
+      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+      while (n_from >= 8 && n_left_to_next >= 4)
+	{
+	  b0 = vlib_get_buffer (vm, buffers[4]);
+	  b1 = vlib_get_buffer (vm, buffers[5]);
+	  b2 = vlib_get_buffer (vm, buffers[6]);
+	  b3 = vlib_get_buffer (vm, buffers[7]);
+	  vlib_prefetch_buffer_header (b0, STORE);
+	  vlib_prefetch_buffer_header (b1, STORE);
+	  vlib_prefetch_buffer_header (b2, STORE);
+	  vlib_prefetch_buffer_header (b3, STORE);
+
+	  /* enqueue buffer */
+	  to_next[0] = bi0 = buffers[0];
+	  to_next[1] = bi1 = buffers[1];
+	  to_next[2] = bi2 = buffers[2];
+	  to_next[3] = bi3 = buffers[3];
+	  to_next += 4;
+	  n_left_to_next -= 4;
+	  buffers += 4;
+
+	  b0 = vlib_get_buffer (vm, bi0);
+	  b1 = vlib_get_buffer (vm, bi1);
+	  b2 = vlib_get_buffer (vm, bi2);
+	  b3 = vlib_get_buffer (vm, bi3);
+
+	  vnet_buffer (b0)->sw_if_index[VLIB_RX] = mif->sw_if_index;
+	  vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0;
+	  vnet_buffer (b1)->sw_if_index[VLIB_RX] = mif->sw_if_index;
+	  vnet_buffer (b1)->sw_if_index[VLIB_TX] = ~0;
+	  vnet_buffer (b2)->sw_if_index[VLIB_RX] = mif->sw_if_index;
+	  vnet_buffer (b2)->sw_if_index[VLIB_TX] = ~0;
+	  vnet_buffer (b3)->sw_if_index[VLIB_RX] = mif->sw_if_index;
+	  vnet_buffer (b3)->sw_if_index[VLIB_TX] = ~0;
+
+	  if (mode == MEMIF_INTERFACE_MODE_IP)
+	    {
+	      next0 = memif_next_from_ip_hdr (node, b0);
+	      next1 = memif_next_from_ip_hdr (node, b1);
+	      next2 = memif_next_from_ip_hdr (node, b2);
+	      next3 = memif_next_from_ip_hdr (node, b3);
+	    }
+	  else if (mode == MEMIF_INTERFACE_MODE_ETHERNET)
+	    {
+	      if (PREDICT_FALSE (mif->per_interface_next_index != ~0))
+		{
+		  next0 = mif->per_interface_next_index;
+		  next1 = mif->per_interface_next_index;
+		  next2 = mif->per_interface_next_index;
+		  next3 = mif->per_interface_next_index;
+		}
+	      else
+		{
+		  next0 = next1 = next2 = next3 = next_index;
+		  /* redirect if feature path enabled */
+		  vnet_feature_start_device_input_x1 (mif->sw_if_index,
+						      &next0, b0);
+		  vnet_feature_start_device_input_x1 (mif->sw_if_index,
+						      &next1, b1);
+		  vnet_feature_start_device_input_x1 (mif->sw_if_index,
+						      &next2, b2);
+		  vnet_feature_start_device_input_x1 (mif->sw_if_index,
+						      &next3, b3);
+		}
+	    }
+
+	  /* trace */
+	  if (PREDICT_FALSE (n_trace > 0))
+	    {
+	      memif_trace_buffer (vm, node, mif, b0, next0, qid, &n_trace);
+	      if (PREDICT_FALSE (n_trace > 0))
+		memif_trace_buffer (vm, node, mif, b1, next1, qid, &n_trace);
+	      if (PREDICT_FALSE (n_trace > 0))
+		memif_trace_buffer (vm, node, mif, b2, next2, qid, &n_trace);
+	      if (PREDICT_FALSE (n_trace > 0))
+		memif_trace_buffer (vm, node, mif, b3, next3, qid, &n_trace);
+	    }
+
+	  /* enqueue */
+	  vlib_validate_buffer_enqueue_x4 (vm, node, next_index, to_next,
+					   n_left_to_next, bi0, bi1, bi2, bi3,
+					   next0, next1, next2, next3);
+
+	  /* next */
+	  n_from -= 4;
+	}
+      while (n_from && n_left_to_next)
+	{
+	  /* enqueue buffer */
+	  to_next[0] = bi0 = buffers[0];
+	  to_next += 1;
+	  n_left_to_next--;
+	  buffers += 1;
+
+	  b0 = vlib_get_buffer (vm, bi0);
+	  vnet_buffer (b0)->sw_if_index[VLIB_RX] = mif->sw_if_index;
+	  vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0;
+
+	  if (mode == MEMIF_INTERFACE_MODE_IP)
+	    {
+	      next0 = memif_next_from_ip_hdr (node, b0);
+	    }
+	  else if (mode == MEMIF_INTERFACE_MODE_ETHERNET)
+	    {
+	      if (PREDICT_FALSE (mif->per_interface_next_index != ~0))
+		next0 = mif->per_interface_next_index;
+	      else
+		{
+		  next0 = next_index;
+		  /* redirect if feature path enabled */
+		  vnet_feature_start_device_input_x1 (mif->sw_if_index,
+						      &next0, b0);
+		}
+	    }
+
+	  /* trace */
+	  if (PREDICT_FALSE (n_trace > 0))
+	    memif_trace_buffer (vm, node, mif, b0, next0, qid, &n_trace);
+
+	  /* enqueue */
+	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+					   n_left_to_next, bi0, next0);
+
+	  /* next */
+	  n_from--;
+	}
+      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+    }
+
+  vlib_increment_combined_counter (vnm->interface_main.combined_sw_if_counters
+				   + VNET_INTERFACE_COUNTER_RX, thread_index,
+				   mif->hw_if_index, n_rx_packets,
+				   n_rx_bytes);
+
+  /* refill ring with empty buffers */
+refill:
+  vec_reset_length (ptd->buffers);
+
+  head = ring->head;
+  n_slots = ring_size - head + mq->last_tail;
+
+  if (n_slots < 8)
+    goto done;
+
+  memif_desc_t *dt = &ptd->desc_template;
+  memset (dt, 0, sizeof (memif_desc_t));
+  dt->length = VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES - start_offset;
+
+  n_alloc = vlib_buffer_alloc_to_ring (vm, mq->buffers, head & mask,
+				       ring_size, n_slots);
+
+  if (PREDICT_FALSE (n_alloc != n_slots))
+    {
+      vlib_error_count (vm, node->node_index,
+			MEMIF_INPUT_ERROR_BUFFER_ALLOC_FAIL, 1);
+    }
+
+  while (n_alloc--)
+    {
+      u16 s = head++ & mask;
+      memif_desc_t *d = &ring->desc[s];
+      clib_memcpy (d, dt, sizeof (memif_desc_t));
+      b0 = vlib_get_buffer (vm, mq->buffers[s]);
+      d->region = b0->buffer_pool_index + 1;
+      d->offset =
+	(void *) b0->data - mif->regions[d->region].shm + start_offset;
+    }
+
+  CLIB_MEMORY_STORE_BARRIER ();
+  ring->head = head;
+
+done:
+  return n_rx_packets;
+}
+
 uword
 CLIB_MULTIARCH_FN (memif_input_fn) (vlib_main_t * vm,
				     vlib_node_runtime_t * node,
@@ -501,6 +812,8 @@ CLIB_MULTIARCH_FN (memif_input_fn) (vlib_main_t * vm,
   memif_main_t *mm = &memif_main;
   vnet_device_input_runtime_t *rt = (void *) node->runtime_data;
   vnet_device_and_queue_t *dq;
+  memif_interface_mode_t mode_ip = MEMIF_INTERFACE_MODE_IP;
+  memif_interface_mode_t mode_eth = MEMIF_INTERFACE_MODE_ETHERNET;
 
   foreach_device_and_queue (dq, rt->devices_and_queues)
   {
@@ -509,27 +822,36 @@ CLIB_MULTIARCH_FN (memif_input_fn) (vlib_main_t * vm,
     if ((mif->flags & MEMIF_IF_FLAG_ADMIN_UP) &&
	(mif->flags & MEMIF_IF_FLAG_CONNECTED))
       {
+	if (mif->flags & MEMIF_IF_FLAG_ZERO_COPY)
+	  {
+	    if (mif->mode == MEMIF_INTERFACE_MODE_IP)
+	      n_rx += memif_device_input_zc_inline (vm, node, frame, mif,
+						    dq->queue_id, mode_ip);
+	    else
+	      n_rx += memif_device_input_zc_inline (vm, node, frame, mif,
+						    dq->queue_id, mode_eth);
+	  }
	if (mif->flags & MEMIF_IF_FLAG_IS_SLAVE)
	  {
	    if (mif->mode == MEMIF_INTERFACE_MODE_IP)
	      n_rx += memif_device_input_inline (vm, node, frame, mif,
						 MEMIF_RING_M2S, dq->queue_id,
-						 MEMIF_INTERFACE_MODE_IP);
+						 mode_ip);
	    else
	      n_rx += memif_device_input_inline (vm, node, frame, mif,
						 MEMIF_RING_M2S, dq->queue_id,
-						 MEMIF_INTERFACE_MODE_ETHERNET);
+						 mode_eth);
	  }
	else
	  {
	    if (mif->mode == MEMIF_INTERFACE_MODE_IP)
	      n_rx += memif_device_input_inline (vm, node, frame, mif,
						 MEMIF_RING_S2M, dq->queue_id,
-						 MEMIF_INTERFACE_MODE_IP);
+						 mode_ip);
	    else
	      n_rx += memif_device_input_inline (vm, node, frame, mif,
						 MEMIF_RING_S2M, dq->queue_id,
-						 MEMIF_INTERFACE_MODE_ETHERNET);
+						 mode_eth);
	  }
       }
   }
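In the refill loop at the end of memif_device_input_zc_inline () the slave, as
the memory owner, hands fresh empty buffers to the master: one zeroed template
descriptor is copied into every slot and only the per-buffer location fields
are patched. A simplified model with stand-in types (the real code uses
memif_desc_t and vlib buffer lookups):

    #include <stdint.h>
    #include <string.h>

    typedef struct { uint16_t flags; uint16_t region;
                     uint32_t length; uint32_t offset; } desc_t;
    typedef struct { uint16_t region; uint32_t offset; } buf_loc_t;

    static void
    refill_slots (desc_t * ring_desc, buf_loc_t * bufs, uint16_t head,
                  uint16_t mask, uint16_t n_alloc, uint32_t data_len)
    {
      desc_t dt;
      memset (&dt, 0, sizeof (dt));     /* flags = 0, nothing chained */
      dt.length = data_len;             /* same usable length per slot */

      while (n_alloc--)
        {
          uint16_t s = head++ & mask;
          memcpy (&ring_desc[s], &dt, sizeof (dt));
          /* only the buffer's location differs per slot */
          ring_desc[s].region = bufs[s].region;
          ring_desc[s].offset = bufs[s].offset;
        }
    }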
diff --git a/src/plugins/memif/private.h b/src/plugins/memif/private.h
index f4ace4cee5d..ad6295e4bb8 100644
--- a/src/plugins/memif/private.h
+++ b/src/plugins/memif/private.h
@@ -83,6 +83,7 @@ typedef struct
   void *shm;
   memif_region_size_t region_size;
   int fd;
+  u8 is_external;
 } memif_region_t;
 
 typedef struct
@@ -101,6 +102,7 @@ typedef struct
   u16 last_head;
   u16 last_tail;
+  u32 *buffers;
 
   /* interrupts */
   int int_fd;
@@ -116,7 +118,9 @@ typedef struct
   _(1, IS_SLAVE, "slave") \
   _(2, CONNECTING, "connecting") \
   _(3, CONNECTED, "connected") \
-  _(4, DELETING, "deleting")
+  _(4, DELETING, "deleting") \
+  _(5, ZERO_COPY, "zero-copy") \
+  _(6, ERROR, "error")
 
 typedef enum
 {
@@ -201,6 +205,7 @@ typedef struct
 
   /* buffer template */
   vlib_buffer_t buffer_template;
+  memif_desc_t desc_template;
 } memif_per_thread_data_t;
 
 typedef struct
@@ -238,6 +243,7 @@ typedef struct
   u32 socket_id;
   u8 *secret;
   u8 is_master;
+  u8 is_zero_copy;
   memif_interface_mode_t mode:8;
   memif_log2_ring_size_t log2_ring_size;
   u16 buffer_size;

diff --git a/src/plugins/memif/socket.c b/src/plugins/memif/socket.c
index 34bff166d79..39ff99e3d8d 100644
--- a/src/plugins/memif/socket.c
+++ b/src/plugins/memif/socket.c
@@ -437,11 +437,14 @@ memif_msg_receive (memif_if_t ** mifp, clib_socket_t * sock, clib_file_t * uf)
       if ((err = memif_init_regions_and_queues (mif)))
	return err;
       memif_msg_enq_init (mif);
-      memif_msg_enq_add_region (mif, 0);
+      /* *INDENT-OFF* */
+      vec_foreach_index (i, mif->regions)
+	memif_msg_enq_add_region (mif, i);
       vec_foreach_index (i, mif->tx_queues)
	memif_msg_enq_add_ring (mif, i, MEMIF_RING_S2M);
       vec_foreach_index (i, mif->rx_queues)
	memif_msg_enq_add_ring (mif, i, MEMIF_RING_M2S);
+      /* *INDENT-ON* */
       memif_msg_enq_connect (mif);
       break;
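The final hunk, below, adds the helper the zero-copy TX path uses to release
transmitted buffer indices: the occupied span may spill past the end of the
index array, so it is freed in at most two contiguous chunks. A stand-alone
model of that split (illustrative values):

    #include <stdint.h>
    #include <stdio.h>

    int
    main (void)
    {
      uint32_t ring_size = 1024, start = 1000, n_buffers = 100;

      if (start + n_buffers <= ring_size)
        printf ("one chunk: [%u, %u)\n", start, start + n_buffers);
      else
        /* wraps: tail of the array first, then the front */
        printf ("two chunks: [%u, %u) and [0, %u)\n",
                start, ring_size, n_buffers - (ring_size - start));
      return 0;
    }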
diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h
index 0ac05f3f703..ba87d9566c7 100644
--- a/src/vlib/buffer_funcs.h
+++ b/src/vlib/buffer_funcs.h
@@ -459,6 +459,30 @@ vlib_buffer_free_from_ring (vlib_main_t * vm, u32 * ring, u32 start,
     }
 }
 
+/** \brief Free buffers from ring without freeing tail buffers
+
+    @param vm - (vlib_main_t *) vlib main data structure pointer
+    @param ring - (u32 *) buffer index ring
+    @param start - (u32) first slot in the ring
+    @param ring_size - (u32) ring size
+    @param n_buffers - (u32) number of buffers
+*/
+always_inline void
+vlib_buffer_free_from_ring_no_next (vlib_main_t * vm, u32 * ring, u32 start,
+				    u32 ring_size, u32 n_buffers)
+{
+  ASSERT (n_buffers <= ring_size);
+
+  if (PREDICT_TRUE (start + n_buffers <= ring_size))
+    {
+      vlib_buffer_free (vm, ring + start, n_buffers);
+    }
+  else
+    {
+      vlib_buffer_free_no_next (vm, ring + start, ring_size - start);
+      vlib_buffer_free_no_next (vm, ring, n_buffers - (ring_size - start));
+    }
+}
+
 /* Add/delete buffer free lists. */
 vlib_buffer_free_list_index_t vlib_buffer_create_free_list (vlib_main_t * vm,
--
cgit 1.2.3-korg