aboutsummaryrefslogtreecommitdiffstats
path: root/vnet/vnet/devices/dpdk/dpdk.h
blob: fd984e4d4df3ed51543c2ab7096b78ab5b2c57dc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef __included_dpdk_h__
#define __included_dpdk_h__

/*
 * $$$$ We should rename always_inline -> clib_always_inline
 *
 * Drop any existing always_inline macro before the DPDK headers are
 * included; this header defines it again after the includes.
 * NOTE(review): presumably this avoids a clash with the DPDK headers' own
 * use of the name -- confirm.
 */
#undef always_inline

#include <rte_config.h>

#include <rte_common.h>
#include <rte_dev.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_tailq.h>
#include <rte_eal.h>
#include <rte_per_lcore.h>
#include <rte_launch.h>
#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_prefetch.h>
#include <rte_lcore.h>
#include <rte_per_lcore.h>
#include <rte_branch_prediction.h>
#include <rte_interrupts.h>
#include <rte_pci.h>
#include <rte_random.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_ethdev.h>
#include <rte_ring.h>
#include <rte_mempool.h>
#include <rte_mbuf.h>
#include <rte_kni.h>
#include <rte_virtio_net.h>
#include <rte_pci_dev_ids.h>
#include <rte_version.h>

#include <vnet/unix/pcap.h>
#include <vnet/devices/virtio/vhost-user.h>

/*
 * Re-establish always_inline, which was #undef'ed ahead of the DPDK includes.
 * With CLIB_DEBUG > 0 it is a plain "static inline"; otherwise the
 * __always_inline__ attribute is added as well.
 */
#if CLIB_DEBUG > 0
#define always_inline static inline
#else
#define always_inline static inline __attribute__ ((__always_inline__))
#endif

/*
 * Size in bytes of one mbuf element: 2048 plus the rte_mbuf header and
 * RTE_PKTMBUF_HEADROOM.
 * NOTE(review): the 2048 is presumably the packet data room -- confirm.
 */
#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
/*
 * 32 << 10 == 32768.
 * NOTE(review): presumably the default number of mbufs to allocate (see
 * dpdk_main_t.num_mbufs) -- confirm against the init code.
 */
#define NB_MBUF   (32<<10)

/*
 * Device class and graph node registration objects shared by the DPDK
 * driver sources.
 *
 * NOTE(review): none of these carries `extern`, so every translation unit
 * that includes this header emits a tentative definition. That links only
 * while the toolchain merges common symbols (-fcommon); under -fno-common it
 * yields duplicate-symbol errors. Consider adding `extern` here once the
 * single defining .c file for each object has been confirmed.
 */
vnet_device_class_t dpdk_device_class;
vlib_node_registration_t dpdk_input_node;
vlib_node_registration_t dpdk_io_input_node;
vlib_node_registration_t handoff_dispatch_node;

/*
 * Kind of device behind a dpdk_device_t (stored in its dev_type field).
 * Numbering starts at 1, so the value 0 does not name any device type.
 */
typedef enum {
  VNET_DPDK_DEV_ETH = 1,      /* Standard DPDK PMD driver */
  VNET_DPDK_DEV_KNI,          /* Kernel NIC Interface */
  VNET_DPDK_DEV_VHOST_USER,   /* vhost-user interface */
  VNET_DPDK_DEV_UNKNOWN,      /* must be last */
} dpdk_device_type_t;

/*
 * X-macro list of known poll-mode drivers as (name string, enum suffix)
 * pairs. Define _(s,f) before expanding it and #undef it afterwards;
 * dpdk_pmd_t uses the suffix to generate VNET_DPDK_PMD_<f> enumerators.
 * NOTE(review): the strings presumably match the driver names reported by
 * DPDK for each device -- confirm against the code that consumes them.
 */
#define foreach_dpdk_pmd          \
  _ ("rte_em_pmd", E1000EM)       \
  _ ("rte_igb_pmd", IGB)          \
  _ ("rte_igbvf_pmd", IGBVF)      \
  _ ("rte_ixgbe_pmd", IXGBE)      \
  _ ("rte_ixgbevf_pmd", IXGBEVF)  \
  _ ("rte_i40e_pmd", I40E)        \
  _ ("rte_i40evf_pmd", I40EVF)    \
  _ ("rte_virtio_pmd", VIRTIO)    \
  _ ("rte_vice_pmd", VICE)        \
  _ ("rte_enic_pmd", ENIC)        \
  _ ("rte_vmxnet3_pmd", VMXNET3)  \
  _ ("AF_PACKET PMD", AF_PACKET)  \
  _ ("rte_pmd_fm10k", FM10K)

/*
 * Poll-mode driver identifier (stored in dpdk_device_t.pmd).
 * VNET_DPDK_PMD_NONE is 0; one enumerator per foreach_dpdk_pmd entry
 * follows, in list order. VNET_DPDK_PMD_NETMAP exists only when NETMAP is
 * defined, so the numeric value of VNET_DPDK_PMD_UNKNOWN depends on the
 * build configuration.
 */
typedef enum {
  VNET_DPDK_PMD_NONE,
#define _(s,f) VNET_DPDK_PMD_##f,
  foreach_dpdk_pmd
#undef _
#ifdef NETMAP
  VNET_DPDK_PMD_NETMAP,
#endif
  VNET_DPDK_PMD_UNKNOWN, /* must be last */
} dpdk_pmd_t;

/*
 * Port classification (stored in dpdk_device_t.port_type).
 * VNET_DPDK_PORT_TYPE_NETMAP exists only when NETMAP is defined, which
 * shifts the numeric values of the enumerators that follow it.
 */
typedef enum {
  VNET_DPDK_PORT_TYPE_ETH_1G,
  VNET_DPDK_PORT_TYPE_ETH_10G,
  VNET_DPDK_PORT_TYPE_ETH_40G,
  VNET_DPDK_PORT_TYPE_ETH_SWITCH,
#ifdef NETMAP
  VNET_DPDK_PORT_TYPE_NETMAP,
#endif
  VNET_DPDK_PORT_TYPE_AF_PACKET,
  VNET_DPDK_PORT_TYPE_UNKNOWN,
} dpdk_port_type_t;

/*
 * A vlib frame paired with a deadline; element type of dpdk_device_t.frames
 * (the per-worker destination frame queue).
 */
typedef struct {
  /* NOTE(review): presumably the time by which `frame` must be flushed --
     confirm against the code that fills it in */
  f64 deadline;
  vlib_frame_t * frame;
} dpdk_frame_t;

/* NOTE(review): upper bound on the EFD discard rate; the unit is not
   visible in this header -- confirm against the EFD implementation */
#define DPDK_EFD_MAX_DISCARD_RATE 10

/*
 * Per-device Early-Fast-Discard (EFD) bookkeeping, embedded in dpdk_device_t
 * as efd_agent.
 * NOTE(review): this header only declares the fields; their exact meaning
 * and update rules should be confirmed against dpdk_efd_update_counters().
 */
typedef struct {
  u16 last_burst_sz;
  u16 max_burst_sz;
  u32 full_frames_cnt;
  u32 consec_full_frames_cnt;
  u32 congestion_cnt;
  u64 last_poll_time;
  u64 max_poll_delay;
  u32 discard_cnt;
  u32 total_packet_cnt;
} dpdk_efd_agent_t;

/*
 * Per-vring state of a vhost-user interface; dpdk_vu_intf_t holds an array
 * of two of these.
 * NOTE(review): callfd/kickfd/errfd are presumably the vhost-user call,
 * kick and error file descriptors, and n_since_last_int/int_deadline
 * presumably drive interrupt coalescing -- confirm.
 */
typedef struct {
  int callfd;
  int kickfd;
  int errfd;
  u32 callfd_idx;
  u32 n_since_last_int;
  f64 int_deadline;
} dpdk_vu_vring;

/*
 * vhost-user specific interface state, referenced from
 * dpdk_device_t.vu_intf.
 */
typedef struct {
  u32 is_up;
  u32 unix_fd;
  u32 unix_file_index;
  u32 client_fd;
  /* socket path, fixed 256-byte buffer */
  char sock_filename[256];
  int sock_errno;
  u8 sock_is_server;
  u8 active;

  u64 feature_mask;
  u32 num_vrings;
  /* room for two vrings; NOTE(review): presumably one rx and one tx --
     confirm */
  dpdk_vu_vring vrings[2];
  /* per-memory-region arrays, VHOST_MEMORY_MAX_NREGIONS entries each */
  u64 region_addr[VHOST_MEMORY_MAX_NREGIONS];
  u32 region_fd[VHOST_MEMORY_MAX_NREGIONS];
} dpdk_vu_intf_t;

/*
 * Signature of the flow control callback stored in
 * dpdk_main_t.flowcontrol_callback and installed through
 * dpdk_set_flowcontrol_callback(). It receives the vlib main, a hardware
 * interface index and a packet count.
 * NOTE(review): when it is invoked and what n_packets counts are not
 * visible in this header -- confirm against the tx path.
 */
typedef void (*dpdk_flowcontrol_callback_t) (vlib_main_t *vm,
                                             u32 hw_if_index,
                                             u32 n_packets);

/*
 * The header for the tx_vectors in dpdk_device_t.
 * Head and tail are indexes into the tx vector and are of type
 * u64 so they never overflow.
 * NOTE(review): where this header is stored relative to the vector data is
 * not visible here -- confirm against the code that allocates tx_vectors.
 */
typedef struct {
  u64 tx_head;
  u64 tx_tail;
} tx_ring_hdr_t;

/*
 * Per-device state; dpdk_main_t.devices holds these.
 * The layout is split by two CLIB_CACHE_LINE_ALIGN_MARK markers
 * (cacheline0 / cacheline1), so do not reorder fields casually.
 */
typedef struct {
  CLIB_CACHE_LINE_ALIGN_MARK(cacheline0);
  /* NOTE(review): presumably a per-device lock word, unused when NULL --
     confirm */
  volatile u32 *lockp;

  /* Instance ID */
  u32 device_index;

  /* vlib hardware / software interface indices for this device */
  u32 vlib_hw_if_index;
  u32 vlib_sw_if_index;

  /* next node index if we decide to steal the rx graph arc */
  u32 per_interface_next_index;

  /* dpdk rte_mbuf rx and tx vectors, VLIB_FRAME_SIZE */
  struct rte_mbuf *** tx_vectors; /* one per worker thread */
  struct rte_mbuf *** rx_vectors;

  /* vector of traced contexts, per device */
  u32 * d_trace_buffers;

  /* per-worker destination frame queue */
  dpdk_frame_t * frames;

  /* both enums are squeezed into 8-bit bit-fields, so their values must
     stay below 256 */
  dpdk_device_type_t dev_type:8;
  dpdk_pmd_t pmd:8;
  /* signed; NOTE(review): presumably a negative value means "unknown
     socket" -- confirm */
  i8 cpu_socket;

  u8 admin_up;
  u8 promisc;

  CLIB_CACHE_LINE_ALIGN_MARK(cacheline1);

  /* PMD related */
  u16 tx_q_used;
  u16 rx_q_used;
  u16 nb_rx_desc;
  u16 nb_tx_desc;
  u16 * cpu_socket_id_by_queue;
  struct rte_eth_conf port_conf;
  struct rte_eth_txconf tx_conf;

  /* KNI related */
  struct rte_kni *kni;
  u8 kni_port_id;

  /* vhost-user related */
  u32 vu_if_id;
  struct virtio_net  vu_vhost_dev;
  u32 vu_is_running;
  dpdk_vu_intf_t *vu_intf;

  /* af_packet */
  u8 af_packet_port_id;

  /* link state and the time it was last refreshed (see
     dpdk_update_link_state) */
  struct rte_eth_link link;
  f64 time_last_link_update;

  /* statistics: current and previous snapshots, extended stats, and the
     time of the last refresh */
  struct rte_eth_stats stats;
  struct rte_eth_stats last_stats;
  struct rte_eth_xstats * xstats;
  f64 time_last_stats_update;
  dpdk_port_type_t port_type;

  /* Early-Fast-Discard bookkeeping for this device */
  dpdk_efd_agent_t efd_agent;
} dpdk_device_t;

/* Capacity of frame_queue_trace_t.n_vectors */
#define MAX_NELTS 32
/*
 * Cache-line aligned snapshot of a frame queue.
 * NOTE(review): presumably filled in by the handoff queue tracing code
 * for debugging; n_vectors presumably records the vector count of up to
 * MAX_NELTS queue elements -- confirm.
 */
typedef struct {
  CLIB_CACHE_LINE_ALIGN_MARK(cacheline0);
  u64 head;
  u64 head_hint;
  u64 tail;
  u32 n_in_use;
  u32 nelts;
  u32 written;
  u32 threshold;
  i32 n_vectors[MAX_NELTS];
} frame_queue_trace_t;

/* 4096; NOTE(review): presumably the number of entries in each per-thread
   tx vector -- confirm */
#define DPDK_TX_RING_SIZE (4 * 1024)

/* Polling periods for statistics and link state.
   NOTE(review): presumably in seconds, matching the f64
   time_last_*_update fields of dpdk_device_t -- confirm */
#define DPDK_STATS_POLL_INTERVAL  10.0
#define DPDK_LINK_POLL_INTERVAL   3.0

/*
 * Per-worker state, cache-line aligned; dpdk_main_t.workers is a vector of
 * these and vnet_get_aggregate_rx_packets() sums the counter across it.
 */
typedef struct {
  CLIB_CACHE_LINE_ALIGN_MARK(cacheline0);

  /* total input packet counter */
  u64 aggregate_rx_packets;
} dpdk_worker_t;

/*
 * A (device, queue) pair; element type of dpdk_main_t.devices_by_cpu.
 * NOTE(review): `device` is presumably an index into dpdk_main_t.devices
 * and `queue_id` an rx queue on that device -- confirm.
 */
typedef struct {
  u32 device;
  u16 queue_id;
} dpdk_device_and_queue_t;

/* Early-Fast-Discard (EFD) */
/* Bit flags; NOTE(review): presumably combined into dpdk_efd_t.enabled --
   confirm */
#define DPDK_EFD_DISABLED                       0
#define DPDK_EFD_DISCARD_ENABLED                (1 << 0)
#define DPDK_EFD_MONITOR_ENABLED                (1 << 1)
#define DPDK_EFD_DROPALL_ENABLED                (1 << 2)

/* Default thresholds; NOTE(review): presumably the initial values of
   dpdk_efd_t.queue_hi_thresh (a percentage) and
   dpdk_efd_t.consec_full_frames_hi_thresh -- confirm */
#define DPDK_EFD_DEFAULT_DEVICE_QUEUE_HI_THRESH_PCT    90
#define DPDK_EFD_DEFAULT_CONSEC_FULL_FRAMES_HI_THRESH  6

/*
 * Global EFD settings, embedded in dpdk_main_t as efd.
 * Four u16 fields, the last being explicit padding.
 */
typedef struct dpdk_efd_t {
  u16 enabled;
  u16 queue_hi_thresh;
  u16 consec_full_frames_hi_thresh;
  u16 pad;
} dpdk_efd_t;

/*
 * Global state of the DPDK driver; the single instance is dpdk_main.
 */
typedef struct {

  /* Devices */
  dpdk_device_t * devices;
  dpdk_device_and_queue_t ** devices_by_cpu;

  /* per-thread recycle lists */
  u32 ** recycle;

  /* flow control callback. If 0 then flow control is disabled */
  dpdk_flowcontrol_callback_t flowcontrol_callback;

  /* vlib buffer free list, must be same size as an rte_mbuf */
  u32 vlib_buffer_free_list_index;

  /*
   * format interface names ala xxxEthernet%d/%d/%d instead of
   * xxxEthernet%x/%x/%x. For VIRL.
   */
  u8 interface_name_format_decimal;


  /* dpdk worker "threads" */
  dpdk_worker_t * workers;

  /* Config stuff */
  u8 ** eal_init_args;
  u8 * eth_if_blacklist;
  u8 * eth_if_whitelist;
  u8 * uio_driver_name;
  u8 no_multi_seg;

  /* Required config parameters */
  u8 coremask_set_manually;
  u8 nchannels_set_manually;
  u32 coremask;
  u32 nchannels;
  u32 num_mbufs;
  u32 use_rss;
  u8 num_kni; /* while kni_init allows u32, port_id in callback fn is only u8 */

  /* Ethernet input node index */
  u32 ethernet_input_node_index;

  /* dpdk i/o thread initialization barrier */
  volatile u32 io_thread_release;

  /* pcap tracing [only works if (CLIB_DEBUG > 0)] */
  int tx_pcap_enable;
  pcap_main_t pcap_main;
  u8 * pcap_filename;
  u32 pcap_sw_if_index;
  u32 pcap_pkts_to_capture;

  /* virtio vhost-user switch */
  u8 use_virtio_vhost;

  /* vhost-user coalescence frames config */
  u32 vhost_coalesce_frames;
  f64 vhost_coalesce_time;

  /* hashes */
  uword * dpdk_device_by_kni_port_id;
  uword * vu_sw_if_index_by_listener_fd;
  uword * vu_sw_if_index_by_sock_fd;
  /* NOTE(review): presumably a vector of device indices of deleted
     vhost-user interfaces kept for reuse -- confirm */
  u32 * vu_inactive_interfaces_device_index;

  /* NOTE(review): presumably the id handed to the next vhost-user
     interface created -- confirm */
  u32 next_vu_if_id;

  /* efd (early-fast-discard) settings */
  dpdk_efd_t efd;

  /*
   * flag indicating that a posted admin up/down
   * (via post_sw_interface_set_flags) is in progress
   */
  u8 admin_up_down_in_progress;

  /* NOTE(review): presumably non-zero when dedicated dpdk i/o threads are
     configured -- confirm */
  u8 have_io_threads;

  /* which cpus are running dpdk-input */
  int input_cpu_first_index;
  int input_cpu_count;

  /* convenience */
  vlib_main_t * vlib_main;
  vnet_main_t * vnet_main;
} dpdk_main_t;

/*
 * The single global dpdk_main_t instance.
 * NOTE(review): without `extern` this is a tentative definition in every
 * translation unit that includes the header; it links only when common
 * symbols are merged (-fcommon). Consider `extern` here plus one definition
 * in a .c file.
 */
dpdk_main_t dpdk_main;

/*
 * Next-node indices for received packets; also the first argument of
 * dpdk_set_next_node(). DPDK_RX_N_NEXT is the count of real entries and
 * must stay last.
 */
typedef enum {
  DPDK_RX_NEXT_IP4_INPUT,
  DPDK_RX_NEXT_IP6_INPUT,
  DPDK_RX_NEXT_MPLS_INPUT,
  DPDK_RX_NEXT_ETHERNET_INPUT,
  DPDK_RX_NEXT_DROP,
  DPDK_RX_N_NEXT,
} dpdk_rx_next_t;

/*
 * Prototypes for functions implemented in the DPDK driver sources. Only the
 * signatures are visible here; the notes below are reviewer assumptions to
 * be checked against the implementations.
 */

/* NOTE(review): purpose not visible in this header -- see the definition */
void vnet_buffer_needs_dpdk_mb (vlib_buffer_t * b);

/* NOTE(review): presumably redirects the given rx next index to the graph
   node named by the string argument -- confirm */
void dpdk_set_next_node (dpdk_rx_next_t, char *);

/* Callback type accepted by dpdk_io_thread(); receives the vlib main */
typedef void (*dpdk_io_thread_callback_t) (vlib_main_t *vm);

/* NOTE(review): presumably the body of a dpdk i/o thread, instance
   `instance_id` of `instances`, invoking `callback` from its loop --
   confirm */
void dpdk_io_thread (vlib_worker_thread_t * w,
                     u32 instances,
                     u32 instance_id,
                     char *worker_name,
                     dpdk_io_thread_callback_t callback);
void dpdk_thread_input (dpdk_main_t * dm, dpdk_device_t * xd);

/* Returns a clib_error_t pointer; NOTE(review): presumably 0 on success,
   per the usual clib convention -- confirm */
clib_error_t * dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd);

/* NOTE(review): presumably stores `callback` in
   dpdk_main_t.flowcontrol_callback -- confirm */
void dpdk_set_flowcontrol_callback (vlib_main_t *vm, 
                                    dpdk_flowcontrol_callback_t callback);

u32 dpdk_interface_tx_vector (vlib_main_t * vm, u32 dev_instance);

vlib_frame_queue_elt_t * vlib_get_handoff_queue_elt (u32 vlib_worker_index);

u32 dpdk_get_handoff_node_index (void);

/* NOTE(review): `op` presumably takes one of the EFD_OPERATION_* values
   defined later in this header -- confirm */
void set_efd_bitmap (u8 *bitmap, u32 value, u32 op);

/*
 * X-macro list of error counters as (enum suffix, description string)
 * pairs. Define _(f,s) before expanding it and #undef it afterwards.
 */
#define foreach_dpdk_error						\
  _(NONE, "no error")							\
  _(RX_PACKET_ERROR, "Rx packet errors")				\
  _(RX_BAD_FCS, "Rx bad fcs")						\
  _(L4_CHECKSUM_ERROR, "Rx L4 checksum errors")				\
  _(IP_CHECKSUM_ERROR, "Rx ip checksum errors")				\
  _(RX_ALLOC_FAIL, "rx buf alloc from free list failed")		\
  _(RX_ALLOC_NO_PHYSMEM, "rx buf alloc failed no physmem")		\
  _(RX_ALLOC_DROP_PKTS, "rx packets dropped due to alloc error")        \
  _(IPV4_EFD_DROP_PKTS, "IPV4 Early Fast Discard rx drops")             \
  _(IPV6_EFD_DROP_PKTS, "IPV6 Early Fast Discard rx drops")             \
  _(MPLS_EFD_DROP_PKTS, "MPLS Early Fast Discard rx drops")             \
  _(VLAN_EFD_DROP_PKTS, "VLAN Early Fast Discard rx drops")

/*
 * Error counter indices: one DPDK_ERROR_<f> per foreach_dpdk_error entry,
 * in list order starting at 0, followed by DPDK_N_ERROR as the count.
 */
typedef enum {
#define _(f,s) DPDK_ERROR_##f,
  foreach_dpdk_error
#undef _
  DPDK_N_ERROR,
} dpdk_error_t;

/*
 * Add `count` to one error counter of the dpdk-input node.
 *
 * The node is looked up through dpdk_input_node.index; the counter slot is
 * the node's error_heap_index offset by `counter_index`. The update is a
 * plain read-modify-write on vm->error_main.counters.
 * NOTE(review): callers presumably pass a DPDK_ERROR_*_EFD_DROP_PKTS value
 * as counter_index -- confirm.
 */
static_always_inline
void increment_efd_drop_counter (vlib_main_t * vm, u32 counter_index, u32 count)
{
   vlib_node_t *input_node = vlib_get_node (vm, dpdk_input_node.index);

   vm->error_main.counters[input_node->error_heap_index + counter_index] += count;
}

/* NOTE(review): presumably refreshes xd->link and
   xd->time_last_link_update using `now` -- confirm */
void dpdk_update_link_state (dpdk_device_t * xd, f64 now);
/* NOTE(review): presumably updates xd->efd_agent for a burst of n_buffers
   packets; `enabled` presumably carries DPDK_EFD_* flags -- confirm */
void dpdk_efd_update_counters(dpdk_device_t *xd, u32 n_buffers, u16 enabled);
/* NOTE(review): presumably returns non-zero when the packet may be dropped
   by Early-Fast-Discard -- confirm */
u32 is_efd_discardable(vlib_thread_main_t *tm,
                       vlib_buffer_t * b0,
                       struct rte_mbuf *mb);

/* dpdk vhost-user interrupt management */
/* `idx` is presumably an index into dpdk_vu_intf_t.vrings -- confirm */
u8 dpdk_vhost_user_want_interrupt (dpdk_device_t *xd, int idx);
void dpdk_vhost_user_send_interrupt (vlib_main_t * vm, dpdk_device_t * xd,
                                    int idx);


static inline u64 vnet_get_aggregate_rx_packets (void)
{
    dpdk_main_t * dm = &dpdk_main;
    u64 sum = 0;
    dpdk_worker_t * dw;

    vec_foreach(dw, dm->workers)
        sum += dw->aggregate_rx_packets;

    return sum;
}

/*
 * NOTE(review): presumably records packet trace data for the `n_buffers`
 * buffer indices in `buffers`, received on queue `queue_id` of device `xd`
 * -- confirm against the implementation.
 */
void dpdk_rx_trace (dpdk_main_t * dm,
                    vlib_node_runtime_t * node,
                    dpdk_device_t * xd,
                    u16 queue_id,
                    u32 * buffers,
                    uword n_buffers);

/* Comparison selectors. NOTE(review): presumably the values accepted by the
   *_op arguments of efd_config() and the op argument of set_efd_bitmap()
   -- confirm */
#define EFD_OPERATION_LESS_THAN          0
#define EFD_OPERATION_GREATER_OR_EQUAL   1

/*
 * Configure Early-Fast-Discard.
 * NOTE(review): each (value, op) pair presumably selects which IP
 * precedence, MPLS EXP and VLAN CoS values are discardable; `enabled`
 * presumably carries DPDK_EFD_* flags -- confirm against the
 * implementation.
 */
void efd_config(u32 enabled,
                u32 ip_prec,  u32 ip_op,
                u32 mpls_exp, u32 mpls_op,
                u32 vlan_cos, u32 vlan_op);

/* Posts an admin up/down change for sw_if_index; dpdk_main_t keeps an
   admin_up_down_in_progress flag for requests made through this call.
   NOTE(review): how and where the request is later applied is not visible
   here -- confirm */
void post_sw_interface_set_flags (vlib_main_t *vm, u32 sw_if_index, u32 flags);

/* Shorthand for struct vhost_user_memory, which is declared elsewhere */
typedef struct vhost_user_memory vhost_user_memory_t;

/* vhost-user processing helpers sharing an opaque context pointer.
   NOTE(review): presumably init allocates *ctx, process_if uses it per
   device and cleanup releases it -- confirm */
void dpdk_vhost_user_process_init (void **ctx);
void dpdk_vhost_user_process_cleanup (void *ctx);
uword dpdk_vhost_user_process_if (vlib_main_t *vm, dpdk_device_t *xd, void *ctx);

/*
 * vhost-user calls: create, modify, delete and dump vhost-user interfaces.
 * All return int. NOTE(review): presumably 0 on success and a non-zero
 * error code otherwise -- confirm. create_if takes sw_if_index as a pointer
 * (presumably an output), whereas modify_if and delete_if take it by value.
 */
int dpdk_vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm,
                              const char * sock_filename,
                              u8 is_server,
                              u32 * sw_if_index,
                              u64 feature_mask,
                              u8 renumber, u32 custom_dev_instance);
int dpdk_vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm,
                              const char * sock_filename,
                              u8 is_server,
                              u32 sw_if_index,
                              u64 feature_mask,
                              u8 renumber, u32 custom_dev_instance);
int dpdk_vhost_user_delete_if (vnet_main_t * vnm, vlib_main_t * vm,
                              u32 sw_if_index);
/* NOTE(review): *out_vuids presumably receives a vector that the caller
   must free -- confirm ownership */
int dpdk_vhost_user_dump_ifs (vnet_main_t * vnm, vlib_main_t * vm,
                             vhost_user_intf_details_t **out_vuids);

/* NOTE(review): presumably returns dpdk_main.admin_up_down_in_progress --
   confirm */
u32 dpdk_get_admin_up_down_in_progress (void);

/* Has the (vm, node, frame) signature of a vlib node function.
   NOTE(review): presumably the dpdk-input variant used when RSS is enabled
   (see dpdk_main_t.use_rss) -- confirm */
uword
dpdk_input_rss (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * f);

#endif /* __included_dpdk_h__ */